Update on "[quant][graphmode][fx] Support quantization for standalone module"

Summary:
Sometimes users need to quantize a submodule as one unit, for example because that
submodule will be lowered to a different backend such as an accelerator.

The submodule is quantized with the same FX-based graph mode quantization functions
and is connected with the rest of the model automatically.

APIs:
```python
class StandaloneModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(1, 1, 1)

    def forward(self, x):
        return self.conv(x)

class CustomTracer(Tracer):
    def is_leaf_module(self, m, module_qualified_name):
        return (m.__module__.startswith('torch.nn') and
                not isinstance(m, torch.nn.Sequential)) or \
            isinstance(m, StandaloneModule)

class ModelThatUsesStandaloneModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.standalone = StandaloneModule()

    def forward(self, x):
        return self.standalone(x)

m = ModelThatUsesStandaloneModule()
qconfig_dict = {"": qconfig, "standalone_module_name": ["standalone"]}
m = prepare_fx(m, qconfig_dict)
calibrate(m, data)
m = convert_fx(m)

m.standalone = lower_to_accelerator(m.standalone)
```
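
In the snippet above, `qconfig`, `prepare_fx`, `convert_fx`, and `data` come from the usual FX graph mode quantization workflow, while `calibrate` and `lower_to_accelerator` are placeholders. As a minimal sketch of what the calibration step might look like, assuming `data` is an iterable of example input batches:

```python
def calibrate(model, data):
    # Feed representative inputs through the prepared model so the
    # observers inserted by prepare_fx can record activation statistics.
    model.eval()
    with torch.no_grad():
        for batch in data:
            model(batch)
```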

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D23580642](https://our.internmc.facebook.com/intern/diff/D23580642)

[ghstack-poisoned]
jerryzh168 committed Sep 29, 2020
2 parents 6587a4b + 29670ca commit 7dd90f4
Showing 51 changed files with 713 additions and 505 deletions.
25 changes: 19 additions & 6 deletions .circleci/cimodel/data/pytorch_build_definitions.py
@@ -6,7 +6,7 @@
import cimodel.lib.conf_tree as conf_tree
import cimodel.lib.miniutils as miniutils
from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode
from cimodel.data.simple.util.branch_filters import gen_filter_dict
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
from cimodel.data.simple.util.docker_constants import gen_docker_image


@@ -110,6 +110,8 @@ def gen_workflow_params(self, phase):
parameters["resource_class"] = resource_class
if phase == "build" and self.rocm_version is not None:
parameters["resource_class"] = "xlarge"
if hasattr(self, 'filters'):
parameters['filters'] = self.filters
return parameters

def gen_workflow_job(self, phase):
@@ -139,14 +141,16 @@ def gen_workflow_job(self, phase):

# TODO This is a hack to special case some configs just for the workflow list
class HiddenConf(object):
def __init__(self, name, parent_build=None):
def __init__(self, name, parent_build=None, filters=None):
self.name = name
self.parent_build = parent_build
self.filters = filters

def gen_workflow_job(self, phase):
return {
self.gen_build_name(phase): {
"requires": [self.parent_build.gen_build_name("build")]
"requires": [self.parent_build.gen_build_name("build")],
"filters": self.filters,
}
}

@@ -166,7 +170,8 @@ def gen_workflow_job(self, phase):
"branch": self.branch,
"requires": [self.parent_build],
"context": "org-member",
"filters": gen_filter_dict(branches_list=["nightly"])
"filters": gen_filter_dict(branches_list=["nightly"],
tags_list=RC_PATTERN)
}
}

@@ -205,7 +210,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_python_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
@@ -219,7 +226,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_cpp_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
@@ -348,6 +357,8 @@ def instantiate_configs():

# run docs builds on "pytorch-linux-xenial-py3.6-gcc5.4". Docs builds
# should run on a CPU-only build that runs on all PRs.
# XXX should this be updated to a more modern build? Projects are
# beginning to drop python3.6
if (
distro_name == "xenial"
and fc.find_prop("pyver") == "3.6"
@@ -358,6 +369,8 @@
and compiler_name == "gcc"
and fc.find_prop("compiler_version") == "5.4"
):
c.filters = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
c.dependent_tests = gen_docs_configs(c)

if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch:
29 changes: 17 additions & 12 deletions .circleci/cimodel/data/simple/docker_definitions.py
@@ -1,6 +1,7 @@
from collections import OrderedDict

from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN


# TODO: make this generated from a matrix rather than just a static list
@@ -24,7 +25,7 @@
"pytorch-linux-xenial-py3.8",
"pytorch-linux-xenial-py3.6-clang7",
"pytorch-linux-xenial-py3.6-gcc4.8",
"pytorch-linux-xenial-py3.6-gcc5.4",
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
@@ -34,16 +35,20 @@

def get_workflow_jobs():
"""Generates a list of docker image build definitions"""
return [
OrderedDict(
ret = []
for image_name in IMAGE_NAMES:
parameters = OrderedDict({
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
})
if image_name == "pytorch-linux-xenial-py3.6-gcc5.4":
# pushing documentation on tags requires CircleCI to also
# build all the dependencies on tags, including this docker image
parameters['filters'] = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
ret.append(OrderedDict(
{
"docker_build_job": OrderedDict(
{
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
}
)
"docker_build_job": parameters
}
)
for image_name in IMAGE_NAMES
]
))
return ret
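
The `filters` mapping built here flows into the generated `config.yml` shown further below. A rough sketch of what `gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)` plausibly returns (the real helper lives in `cimodel/data/simple/util/branch_filters.py`, so this reconstruction is inferred from the YAML it produces):

```python
# Hypothetical reconstruction of the helper used above; the actual
# implementation lives in cimodel/data/simple/util/branch_filters.py.
RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"

def gen_filter_dict(branches_list, tags_list=None):
    # CircleCI "filters" mapping: run on matching branches, and
    # optionally also on matching tags (needed for RC doc pushes).
    filters = {"branches": {"only": branches_list}}
    if tags_list is not None:
        filters["tags"] = {"only": tags_list}
    return filters
```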
39 changes: 37 additions & 2 deletions .circleci/config.yml
@@ -1188,10 +1188,13 @@ jobs:
set -ex
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
tag=${CIRCLE_TAG:1:5}
target=${tag:-master}
echo "building for ${target}"
time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1'
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
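
The new `tag=${CIRCLE_TAG:1:5}` / `target=${tag:-master}` lines choose where the docs are pushed: on a release-candidate tag the leading `v` is stripped and the first five characters of the version become the target directory, otherwise the docs still go to `master`. A small sketch of the same selection logic in Python (the `v1.7.0-rc1` value is purely illustrative):

```python
def docs_target(circle_tag):
    # Mirrors the shell logic: tag=${CIRCLE_TAG:1:5}; target=${tag:-master}
    if circle_tag:
        return circle_tag[1:6]  # e.g. "v1.7.0-rc1" -> "1.7.0"
    return "master"

assert docs_target("v1.7.0-rc1") == "1.7.0"
assert docs_target("") == "master"
```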
@@ -1230,10 +1233,13 @@ jobs:
set -ex
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
tag=${CIRCLE_TAG:1:5}
target=${tag:-master}
echo "building for ${target}"
time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
@@ -6432,6 +6438,11 @@ workflows:
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc5.4"
image_name: "pytorch-linux-xenial-py3.6-gcc5.4"
filters:
branches:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc7.2"
image_name: "pytorch-linux-xenial-py3.6-gcc7.2"
@@ -6450,14 +6461,29 @@
- "docker-pytorch-linux-xenial-py3.6-gcc5.4"
build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
filters:
branches:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- pytorch_linux_test:
name: pytorch_linux_xenial_py3_6_gcc5_4_test
requires:
- pytorch_linux_xenial_py3_6_gcc5_4_build
build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
resource_class: large
filters:
branches:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- pytorch_python_doc_build:
filters:
branches:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
requires:
- pytorch_linux_xenial_py3_6_gcc5_4_build
- pytorch_doc_push:
@@ -6467,10 +6493,17 @@
branches:
only:
- nightly
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
name: pytorch_python_doc_push
requires:
- pytorch_python_doc_build
- pytorch_cpp_doc_build:
filters:
branches:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
requires:
- pytorch_linux_xenial_py3_6_gcc5_4_build
- pytorch_doc_push:
@@ -6480,6 +6513,8 @@
branches:
only:
- nightly
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
name: pytorch_cpp_doc_push
requires:
- pytorch_cpp_doc_build
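
Every job on the doc-push path now also runs for release-candidate tags, admitted by the `/v[0-9]+(\.[0-9]+)*-rc[0-9]+/` pattern. A quick, illustrative check of which tag names that regex accepts (CircleCI supplies the surrounding slashes, so the bare pattern is used here):

```python
import re

rc_regex = re.compile(r"v[0-9]+(\.[0-9]+)*-rc[0-9]+")

assert rc_regex.fullmatch("v1.7.0-rc1")      # release candidate: runs the job
assert not rc_regex.fullmatch("v1.7.0")      # final release tag: filtered out
assert not rc_regex.fullmatch("nightly")     # not a tag pattern: filtered out
```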
10 changes: 8 additions & 2 deletions .circleci/verbatim-sources/job-specs/job-specs-custom.yml
@@ -43,10 +43,13 @@
set -ex
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
tag=${CIRCLE_TAG:1:5}
target=${tag:-master}
echo "building for ${target}"
time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1'
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
@@ -85,10 +88,13 @@
set -ex
export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
tag=${CIRCLE_TAG:1:5}
target=${tag:-master}
echo "building for ${target}"
time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1'
export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1'
echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
2 changes: 2 additions & 0 deletions BUILD.bazel
@@ -55,6 +55,7 @@ cc_library(
"c10/cuda/*.h",
"c10/cuda/impl/*.h",
"c10/macros/*.h",
"c10/mobile/*.h",
"c10/util/*.h",
"c10/util/*.hpp",
]),
@@ -71,6 +72,7 @@ cc_library(
srcs = glob([
"c10/core/*.cpp",
"c10/core/impl/*.cpp",
"c10/mobile/*.cpp",
"c10/util/*.cpp",
]) + if_cuda(
glob([
5 changes: 5 additions & 0 deletions aten/src/ATen/Context.cpp
@@ -64,6 +64,11 @@ bool Context::deterministic() const {
}

void Context::setDeterministic(bool b) {
if (b) {
TORCH_WARN_ONCE("torch.set_deterministic is in beta, and its design and "
" functionality may change in the future.");
}

_deterministic = b;
}
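
The warning fires once, the first time deterministic mode is enabled. From Python the corresponding call is `torch.set_deterministic`, so usage looks roughly like:

```python
import torch

# The first call with True emits the one-time beta warning added above;
# later calls stay silent because TORCH_WARN_ONCE only reports once.
torch.set_deterministic(True)
```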

12 changes: 10 additions & 2 deletions aten/src/ATen/cpu/vec256/vec256_int.h
@@ -104,6 +104,8 @@ class Vec256<int64_t> : public Vec256i {
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
// ptr does not need to be aligned here. See
// https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int64_t tmp_values[size()];
@@ -228,6 +230,8 @@ class Vec256<int32_t> : public Vec256i {
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
// ptr does not need to be aligned here. See
// https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int32_t tmp_values[size()];
@@ -449,6 +453,8 @@ class Vec256<int16_t> : public Vec256i {
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
// ptr does not need to be aligned here. See
// https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int16_t tmp_values[size()];
@@ -699,6 +705,8 @@ class Vec256<int8_t> : public Vec256i {
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
// ptr does not need to be aligned here. See
// https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
} else if (count > 0) {
__at_align32__ int8_t tmp_values[size()];
@@ -879,8 +887,8 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>

template <typename T, typename Op>
Vec256<T> inline int_elementwise_binary_256(const Vec256<T>& a, const Vec256<T>& b, Op op) {
__at_align32__ T values_a[Vec256<T>::size()];
__at_align32__ T values_b[Vec256<T>::size()];
T values_a[Vec256<T>::size()];
T values_b[Vec256<T>::size()];
a.store(values_a);
b.store(values_b);
for (int i = 0; i != Vec256<T>::size(); i++) {
4 changes: 2 additions & 2 deletions aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -238,9 +238,9 @@ static void logical_not_kernel(TensorIterator& iter) {
// NOTE: this implementation differs from the CUDA implementation which only does single dispatch
// (to avoid expensive compilation) because CPU kernels don't handle dynamic_casting
// (see needs_dynamic_casting).
AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cpu", [&]() {
using self_t = scalar_t;
AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cpu", [&]() {
cpu_kernel(iter, [](self_t a) -> scalar_t { return static_cast<scalar_t>(!a); });
});
});
6 changes: 3 additions & 3 deletions aten/src/ATen/native/cuda/UnarySignKernels.cu
@@ -11,11 +11,11 @@
namespace at { namespace native {

void logical_not_kernel_cuda(TensorIterator& iter) {
// error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND2(...)
// error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND_COMPLEX_AND3(...)
// so we don't have to maintain a separate list or to do double dispatch.
AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cuda", [&]() {});
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cuda", [&]() {});

AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cuda", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !a; });
});
}
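
With the dispatch widened to `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, ...)`, `logical_not` now covers bfloat16 and complex inputs on both CPU and CUDA. An illustrative check of the newly covered dtypes:

```python
import torch

# bfloat16 input: nonzero values map to False, zero maps to True.
print(torch.logical_not(torch.tensor([0., 1., 2.], dtype=torch.bfloat16)))  # -> [True, False, False]

# complex input is now dispatched as well.
print(torch.logical_not(torch.tensor([0 + 0j, 1 + 1j])))  # -> [True, False]
```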
