From 42b5601f30a4e6fd02c4a0173c4bebdc24e002ee Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Tue, 22 Dec 2020 08:39:17 -0800 Subject: [PATCH 01/45] [ROCm] add 4.0 to nightly builds (#49632) Summary: Depends on https://github.com/pytorch/builder/pull/614. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49632 Reviewed By: ngimel Differential Revision: D25665880 Pulled By: walterddr fbshipit-source-id: b37a55b7e3028648453b422683fa4a72e0ee04a4 --- .circleci/cimodel/data/dimensions.py | 2 +- .circleci/config.yml | 208 +++++++++++++-------------- 2 files changed, 105 insertions(+), 105 deletions(-) diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index c9aab39ddd2a..450dd573f023 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -8,8 +8,8 @@ ] ROCM_VERSIONS = [ - "3.9", "3.10", + "4.0", ] ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] diff --git a/.circleci/config.yml b/.circleci/config.yml index d8b44cb13ea2..0716e516518b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2162,8 +2162,8 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda110" - binary_linux_build: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" filters: branches: only: @@ -2171,10 +2171,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.10 devtoolset7" filters: branches: only: @@ -2182,10 +2182,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" filters: branches: only: @@ -2193,10 +2193,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" filters: branches: only: @@ -2204,10 +2204,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build - build_environment: "manywheel 3.6m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm4.0 devtoolset7" filters: branches: only: @@ -2215,10 +2215,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.10" + 
docker_image: "pytorch/manylinux-rocm:4.0" - binary_linux_build: - name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build - build_environment: "manywheel 3.7m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm4.0 devtoolset7" filters: branches: only: @@ -2226,10 +2226,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" - binary_linux_build: - name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build - build_environment: "manywheel 3.8m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm4.0 devtoolset7" filters: branches: only: @@ -2237,10 +2237,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" - binary_linux_build: - name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build - build_environment: "manywheel 3.9m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm4.0 devtoolset7" filters: branches: only: @@ -2248,7 +2248,7 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3732,8 +3732,8 @@ workflows: use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" filters: branches: only: @@ -3742,13 +3742,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.10 devtoolset7" filters: branches: only: @@ -3757,13 +3757,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" filters: branches: only: @@ -3772,13 +3772,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - 
binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" filters: branches: only: @@ -3787,13 +3787,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test - build_environment: "manywheel 3.6m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm4.0 devtoolset7" filters: branches: only: @@ -3802,13 +3802,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.10" + - binary_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test - build_environment: "manywheel 3.7m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm4.0 devtoolset7" filters: branches: only: @@ -3817,13 +3817,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.10" + - binary_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test - build_environment: "manywheel 3.8m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm4.0 devtoolset7" filters: branches: only: @@ -3832,13 +3832,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.10" + - binary_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test - build_environment: "manywheel 3.9m rocm3.10 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm4.0 devtoolset7" filters: branches: only: @@ -3847,8 +3847,8 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.10" + - binary_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: @@ -5558,10 +5558,10 @@ workflows: 
package_type: manywheel upload_subfolder: cu110 - binary_upload: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5570,12 +5570,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5584,12 +5584,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5598,12 +5598,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5612,12 +5612,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test + - binary_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly_test filters: branches: only: @@ -5626,12 +5626,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.10 + upload_subfolder: rocm4.0 - binary_upload: - name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test + - binary_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly_test filters: branches: only: @@ -5640,12 +5640,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.10 + upload_subfolder: rocm4.0 - binary_upload: - name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test + - binary_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly_test filters: branches: only: @@ -5654,12 +5654,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.10 + upload_subfolder: rocm4.0 - 
binary_upload: - name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test + - binary_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly_test filters: branches: only: @@ -5668,7 +5668,7 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.10 + upload_subfolder: rocm4.0 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -8460,99 +8460,99 @@ workflows: use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly - build_environment: "manywheel 3.6m rocm3.10 devtoolset7" + name: smoke_linux_manywheel_3_6m_rocm4_0_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm4.0 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly - build_environment: "manywheel 3.7m rocm3.10 devtoolset7" + name: smoke_linux_manywheel_3_7m_rocm4_0_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm4.0 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - 
smoke_linux_test: - name: smoke_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly - build_environment: "manywheel 3.8m rocm3.10 devtoolset7" + name: smoke_linux_manywheel_3_8m_rocm4_0_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm4.0 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly - build_environment: "manywheel 3.9m rocm3.10 devtoolset7" + name: smoke_linux_manywheel_3_9m_rocm4_0_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm4.0 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.10" + docker_image: "pytorch/manylinux-rocm:4.0" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: From 7b4a7661d6de659c8423015a2f3e93308eb83850 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Dec 2020 09:31:07 -0800 Subject: [PATCH 02/45] Make PyTorch partially cross-compilable for Apple M1 (#49701) Summary: Update CPUINFO to include https://github.com/pytorch/cpuinfo/pull/51 Update sleef to include https://github.com/shibatch/sleef/pull/376 Modify aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt to recognize CMAKE_OSX_ARCHITECTURES Pull Request resolved: https://github.com/pytorch/pytorch/pull/49701 Test Plan: `cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DUSE_XNNPACK=NO -DBUILD_TEST=YES .. -G Ninja; ninja basic` finishes successfully on Apple M1 Reviewed By: janeyx99 Differential Revision: D25669219 Pulled By: malfet fbshipit-source-id: 5ee36b64e3a7ac76448f2a300ac4993375a26de5 --- .../native/quantized/cpu/qnnpack/CMakeLists.txt | 14 ++++++++++---- cmake/Modules/FindARM.cmake | 2 +- third_party/XNNPACK | 2 +- third_party/cpuinfo | 2 +- third_party/sleef | 2 +- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt index 99bf8ba07074..01c815139de3 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt @@ -21,6 +21,12 @@ option(PYTORCH_QNNPACK_BUILD_BENCHMARKS "Build QNNPACK benchmarks" ON) # Enable runtime requantization. 
add_definitions(-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION=1) +# ---[ Target processor +SET(PYTORCH_QNNPACK_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}") +IF(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$") + SET(PYTORCH_QNNPACK_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}") +ENDIF() + # ---[ CMake options if(PYTORCH_QNNPACK_BUILD_TESTS) enable_testing() @@ -244,11 +250,11 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7") list(APPEND PYTORCH_QNNPACK_UKERNELS ${PYTORCH_QNNPACK_ARM_NEON_UKERNELS}) list(APPEND PYTORCH_QNNPACK_UKERNELS ${PYTORCH_QNNPACK_AARCH32_ASM_UKERNELS}) endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*") +if(PYTORCH_QNNPACK_TARGET_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*") list(APPEND PYTORCH_QNNPACK_UKERNELS ${PYTORCH_QNNPACK_ARM_NEON_UKERNELS}) list(APPEND PYTORCH_QNNPACK_UKERNELS ${PYTORCH_QNNPACK_AARCH64_ASM_UKERNELS}) endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|x86_64)$" OR IOS_ARCH MATCHES "^(i386|x86_64)$") +if(PYTORCH_QNNPACK_TARGET_PROCESSOR MATCHES "^(i[3-6]86|x86_64)$" OR IOS_ARCH MATCHES "^(i386|x86_64)$") list(APPEND PYTORCH_QNNPACK_UKERNELS ${PYTORCH_QNNPACK_X86_SSE2_UKERNELS}) endif() @@ -271,13 +277,13 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7") set_property(SOURCE ${PYTORCH_QNNPACK_AARCH32_ASM_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ") endif() endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*") +if(PYTORCH_QNNPACK_TARGET_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*") set_property(SOURCE ${PYTORCH_QNNPACK_ARM_NEON_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O2 ") if(IOS) set_property(SOURCE ${PYTORCH_QNNPACK_AARCH64_ASM_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ") endif() endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|x86_64)$" OR IOS_ARCH MATCHES "^(i386|x86_64)$") +if(PYTORCH_QNNPACK_TARGET_PROCESSOR MATCHES "^(i[3-6]86|x86_64)$" OR IOS_ARCH MATCHES "^(i386|x86_64)$") set_property(SOURCE ${PYTORCH_QNNPACK_X86_SSE2_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O2 -msse2 ") endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7") diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake index acd00cfa6772..2e5508716035 100644 --- a/cmake/Modules/FindARM.cmake +++ b/cmake/Modules/FindARM.cmake @@ -41,7 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (OMAP4_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND NOT CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") set(NEON_FOUND true CACHE BOOL "NEON available on ARM64") ENDIF() EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE diff --git a/third_party/XNNPACK b/third_party/XNNPACK index 3cf9366ab0d5..e1ffe154593a 160000 --- a/third_party/XNNPACK +++ b/third_party/XNNPACK @@ -1 +1 @@ -Subproject commit 3cf9366ab0d5e61e3cf049f794ee7ac122bba937 +Subproject commit e1ffe154593a2e6714d3d2370739cf6fea1055c6 diff --git a/third_party/cpuinfo b/third_party/cpuinfo index ed8b86a25380..5916273f79a2 160000 --- a/third_party/cpuinfo +++ b/third_party/cpuinfo @@ -1 +1 @@ -Subproject commit ed8b86a253800bafdb7b25c5c399f91bff9cb1f3 +Subproject commit 5916273f79a21551890fd3d56fc5375a78d1598d diff --git a/third_party/sleef b/third_party/sleef index 
f66b143ae2aa..e0a003ee838b 160000 --- a/third_party/sleef +++ b/third_party/sleef @@ -1 +1 @@ -Subproject commit f66b143ae2aa1c259ef6cfb6048c9549d49bd5e9 +Subproject commit e0a003ee838b75d11763aa9c3ef17bf71a725bff From eabe05ab7272131e23ff431a12b0ead6848e0c07 Mon Sep 17 00:00:00 2001 From: Oleg Khabinov Date: Tue, 22 Dec 2020 12:00:45 -0800 Subject: [PATCH 03/45] [onnxifi] Get rid of class member (#49380) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49380 Couldn't resist removing a class member that is only used in one function. Reviewed By: yinghai Differential Revision: D25547366 fbshipit-source-id: 74e61c6a0068566fb7956380862999163e7e94bf --- caffe2/opt/onnxifi_op.cc | 15 +++++++++------ caffe2/opt/onnxifi_op.h | 13 +++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/caffe2/opt/onnxifi_op.cc b/caffe2/opt/onnxifi_op.cc index e22a297b0dd4..624e91f3780f 100644 --- a/caffe2/opt/onnxifi_op.cc +++ b/caffe2/opt/onnxifi_op.cc @@ -457,8 +457,10 @@ void OnnxifiOp::adjustOutputBatchSizes(int current_batch_size) { } template <> -void OnnxifiOp::setOutputShapeAndType(int output_idx) { - tensor_dims_int64_.clear(); +void OnnxifiOp::setOutputShapeAndType( + int output_idx, + c10::SmallVector& tensor_dims_int64) { + tensor_dims_int64.clear(); std::vector tensor_dims; uint64_t type = ONNXIFI_DATATYPE_FLOAT32; const auto it = output_shape_hints_.find(output_idx); @@ -484,14 +486,14 @@ void OnnxifiOp::setOutputShapeAndType(int output_idx) { std::copy( tensor_dims.cbegin(), tensor_dims.cend(), - std::back_inserter(tensor_dims_int64_)); + std::back_inserter(tensor_dims_int64)); // Setup the output C2 tensor if (!info.quantized) { // Normal Tensor auto* output_tensor = Output( output_idx, - tensor_dims_int64_, + tensor_dims_int64, at::dtype(OnnxifiTypeToDataType(type)).device(CPU)); setOutputTensorDescriptorTypeAndBuffer( type, output_tensor, &tensor_descriptor); @@ -499,7 +501,7 @@ void OnnxifiOp::setOutputShapeAndType(int output_idx) { // single quantizer, output Int8Tensor auto* output_tensor = this->template Output(output_idx); - output_tensor->t.Resize(tensor_dims_int64_); + output_tensor->t.Resize(tensor_dims_int64); setOutputTensorDescriptorTypeAndBuffer( type, &output_tensor->t, &tensor_descriptor); tensor_descriptor.quantizationParams = 1; @@ -542,8 +544,9 @@ bool OnnxifiOp::RunOnDevice() { } CAFFE_ENFORCE_EQ(output_desc_.size(), OutputSize()); + c10::SmallVector tensor_dims_int64; for (unsigned i = 0U; i < OutputSize(); ++i) { - setOutputShapeAndType(i); + setOutputShapeAndType(i, tensor_dims_int64); } bool ext_supported = false; onnxMemoryFenceV1 input_fence; diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index ce732f7604bc..caffae632827 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -196,7 +196,13 @@ class OnnxifiOp final : public Operator { } #endif private: - void setOutputShapeAndType(int output_idx); + // Second argument is a cache vector to avoid repeated reallocation. + // The existence of this is not ideal, which is purely due to the fact that + // we use int64_t for c2::tensor dim but uint64_t for onnxDesciptor dim. + // Maybe we should just use int64_t. + void setOutputShapeAndType( + int output_idx, + c10::SmallVector& tensor_dims_int64); void buildPropertyList( const OperatorDef& /* unused */, @@ -474,11 +480,6 @@ class OnnxifiOp final : public Operator { // Indicate if i-th output is a quantized tensor std::vector quantized_outputs_; - // A cache vector to avoid repeated reallocation. 
The existence of this is not - // ideal, which is purely due to the factor that we use int64_t for c2::tensor - // dim but uint64_t for onnxDesciptor dim. Maybe we should just use int64_t - c10::SmallVector tensor_dims_int64_; - // This is for multi group quantization info std::vector> all_scales_; std::vector> all_offsets_; From c23808d8e87aa8ebda39f6960daceb2a51019196 Mon Sep 17 00:00:00 2001 From: albanD Date: Tue, 22 Dec 2020 12:07:00 -0800 Subject: [PATCH 04/45] Reland: Add base forward grad logic (#49734) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49734 RFC: https://github.com/pytorch/rfcs/pull/11 This PR add the basic logic to handle forward grad as dual Tensors. It contains the following: - Mechanism to save dual state on a Tensor and clear it up when the dual level ends - C++ and python user facing API - Updated view system that is able to track both forward and backward views The current PR has the following limitations: - Extensive tests are in the next PR in the stack as formulas are needed to write full tests. - Only the manual formulas have been audited and no other formula is actually implemented here (they are in the next PR in the stack) - Only level 0 is allowed for now. This was discussed and agreed that it is not needed for the first version of this PR. - We can save one ViewInfo creation when both the forward and backward views have the same base. This can be done by adding a boolean flag to the DifferentiableViewMeta and extra logic in the `as_view` method. This is left out to keep this PR concise. - We can skip tracking forward views if the base has a forward grad. This can be done by adding extra logic in the `as_view` method. This is left out to keep this PR concise. Reading guide: - Updated view handling in [gen_variable_type.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-f6553cec68caeaea36f6c8b14ff76a6d39dfd774e0ea9ef2f76e8d81fd9af5df), [VariableTypeUtils.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-ec71cfa45954dece1236c661d170e6341879c5be637f4abf52e826d61b40695a), [variable.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-60e3bfe444e89efc7149f25b38e472710525984789934ab83f1bd5671b8ff285) (skip code below "[Forward Grad View]" for now), [variable.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-1604bcd0e4350ed99ec45e437cee7ac9ebe337392c9ea16a236247aeeb35b02bR266-R542) and [custom_function.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-dd85f452082b5bb6612bbc12adb496f8827defa228509f7b493de1d517522d5d). This introduces the new ViewInfo to hold view informations shared for forward and backward. It also updates the differentiable view meta to use this. And it updates the as_view function to handle both forward and backward view. - New forward grad class that handle storing gradients and tracking at each level [forward_grad.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-c6c5b9ab2d7e5dde4102495faa1b6bbbfc23aa3e47deb7359c0bfe1eb004c0cb), [forward_grad.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-de2ab54ade7312701850d71a119a4f4ee4b9fc5a9c42a467cdd4e73c033531dd) and [build_variables.bzl](https://github.com/pytorch/pytorch/pull/49097/files#diff-dfdfa2efb17beddfd9094524f95351fd197db6c8857e96b436fb599870359325). EDIT: These files also contain the new flag to globally disable forward AD that allows us to reduce performance issues while this is in development. 
- Lowest level API and binding between Tensor and AutogradMeta in [TensorBody.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-7554853205392fa743357bf845ecc350a974ec049383248c12daaf2f4de04911), [TensorImpl.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-052bd9150ef8e09289ddf644b5a6830ede49207201cd41728f6d7cc6d9cead94), [TensorImpl.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-a15aae4cf23da44970db7cece62ff981265575c798c62f7b52d87c8809dfe2e1) and the rest of [variable.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-60e3bfe444e89efc7149f25b38e472710525984789934ab83f1bd5671b8ff285R557-R677) - API to access the forward primal that needs to be a differentiable function (and so in native_functions.yaml) [native_functions.yaml](https://github.com/pytorch/pytorch/pull/49097/files#diff-2f3dbd85efb9b5172f2264eedd3be47dd765e6ab7cc8bf3ade5e62c28ae35991) [NamedRegistrations.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-69bd3bea510c9b64e1633fa18c3ea63d4b8348dbad3a78ad9de844ab3e43dc1d), [VariableMethodsStub.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-23f5fcb737a2b289811fe0f4b65aef775e7c824b2e629ecd343df51405cd434f), [derivatives.yaml](https://github.com/pytorch/pytorch/pull/49097/files#diff-e4c2f99a2404e98c3586e07425da73008f36b1bada790648a7297af141d37f8c), [gen_python_functions.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-e4c2f99a2404e98c3586e07425da73008f36b1bada790648a7297af141d37f8c), [gen_trace_type.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-54e0b976027bf8debefb959ff360b89ae93466970c843365b1b3a03806d868ce), [TraceTypeManual.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-f34636741ad4a23d018e0c289bc750c3bad887b45660e1d6eaf440d234a78fbf) and [part of VariableTypeManual.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-6e19a1bce8cbdba8714b6e2c794a76bc0864b64a49cfa757cb0b5afdc937d1a4R198-R243) - c++ API [autograd.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-349028fbe8291a965a7a263c323b208fe071c35c66179ee997ef84fa81aa4b1e), [autograd.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-a3fe908d67dfec16a1fcde300de68b0701bf68b88db7451f29f2bee255cf30c9) - python binding [init.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-c58a67c85191c22c9b3bb439117d8053edfd9dea839fa010cf967d404c3c630d) - python API [forward_ad.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-a4efad4ba18fffdfb264c21e5475997a24a743089a899f8ec1a5ff962c6738d9), [autograd/__init__.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-743abcafd32ad0e69f39ac5a91df4197b7e1921c135cacee7ef6dc829a8a7af8) - c++ and python printing [Formatting.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-881dba501e71662e2e4818b4b016f739b344c8aed2f5edc6b871eda47a2aced0), [_tensor_str.py](https://github.com/pytorch/pytorch/pull/49097/files#diff-a7911f8d5e73adbff914d99fd7818ace2a7030b6a3748abe06ec6fc6e3df9cc3) - Utility for formulas and updated manual functions to respect new view system as well as forward grad [FunctionsManual.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-6378bb6dc81a64dab676d61731341fa5d1088418f32a1473a33a0ccfc2357dc1), [FunctionsManual.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-4adbd88239afcd60e8198aab65d4f5e43b62314e34b80551e997a1ea503adea5) [rest of VariableTypeManual.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-6e19a1bce8cbdba8714b6e2c794a76bc0864b64a49cfa757cb0b5afdc937d1a4R264-R433) 
- Ensure SavedVariable save forward grad properly [saved_variable.h](https://github.com/pytorch/pytorch/pull/49097/files#diff-c1b8039d776241abe177d5aa99b79dd9489a9b3e529da8ab24c2e386c1238ae2), [saved_variable.cpp](https://github.com/pytorch/pytorch/pull/49097/files#diff-cc9fba479b5beae06b2eea2e390d17796e0341c5b037a20b5bcaccbb0c341030) Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D25678797 Pulled By: albanD fbshipit-source-id: 3d58550c11b5f58b9b73fd30596d042b857fb9dd --- aten/src/ATen/core/Formatting.cpp | 5 + aten/src/ATen/core/NamedRegistrations.cpp | 1 + aten/src/ATen/native/AutogradComposite.cpp | 27 +++ aten/src/ATen/native/VariableMethodStubs.cpp | 4 + aten/src/ATen/native/native_functions.yaml | 14 ++ aten/src/ATen/templates/TensorBody.h | 17 ++ c10/core/TensorImpl.cpp | 11 + c10/core/TensorImpl.h | 38 ++++ test/test_autograd.py | 81 +++++++ test/test_namedtuple_return_api.py | 7 +- tools/autograd/gen_python_functions.py | 3 +- tools/autograd/gen_trace_type.py | 2 +- tools/autograd/gen_variable_type.py | 10 +- tools/build_variables.bzl | 2 + torch/_C/__init__.pyi.in | 6 + torch/_tensor_str.py | 20 +- torch/autograd/__init__.py | 1 + torch/autograd/forward_ad.py | 116 ++++++++++ torch/csrc/autograd/FunctionsManual.cpp | 12 + torch/csrc/autograd/FunctionsManual.h | 4 + torch/csrc/autograd/TraceTypeManual.cpp | 1 + torch/csrc/autograd/VariableTypeManual.cpp | 86 +++++++- torch/csrc/autograd/VariableTypeUtils.h | 161 ++++++++------ torch/csrc/autograd/autograd.cpp | 13 ++ torch/csrc/autograd/autograd.h | 15 ++ torch/csrc/autograd/autograd_meta.cpp | 218 +++++++++++++++++++ torch/csrc/autograd/custom_function.cpp | 4 +- torch/csrc/autograd/forward_grad.cpp | 90 ++++++++ torch/csrc/autograd/forward_grad.h | 193 ++++++++++++++++ torch/csrc/autograd/functions/basic_ops.cpp | 4 + torch/csrc/autograd/functions/basic_ops.h | 4 + torch/csrc/autograd/init.cpp | 50 +++++ torch/csrc/autograd/saved_variable.cpp | 16 ++ torch/csrc/autograd/saved_variable.h | 12 + torch/csrc/autograd/variable.cpp | 145 ++++++++---- torch/csrc/autograd/variable.h | 200 ++++++++++++++--- torch/overrides.py | 2 + 37 files changed, 1442 insertions(+), 153 deletions(-) create mode 100644 aten/src/ATen/native/AutogradComposite.cpp create mode 100644 torch/autograd/forward_ad.py create mode 100644 torch/csrc/autograd/autograd_meta.cpp create mode 100644 torch/csrc/autograd/forward_grad.cpp create mode 100644 torch/csrc/autograd/forward_grad.h diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 219dc857f2a1..eb124dab6874 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -292,6 +292,11 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi stream << ", axis: " << tensor_.q_per_channel_axis(); } } + + auto& fw_grad = tensor.fw_grad(/* level */ 0); + if (fw_grad.defined()) { + stream << ", tangent:" << std::endl << fw_grad; + } stream << " ]"; } return stream; diff --git a/aten/src/ATen/core/NamedRegistrations.cpp b/aten/src/ATen/core/NamedRegistrations.cpp index c4c1b1ecc9ba..d9a4979ff3c9 100644 --- a/aten/src/ATen/core/NamedRegistrations.cpp +++ b/aten/src/ATen/core/NamedRegistrations.cpp @@ -510,4 +510,5 @@ TORCH_LIBRARY_IMPL(aten, Named, m) { m.impl("_version", CppFunction::makeFallthrough()); m.impl("requires_grad_", CppFunction::makeFallthrough()); m.impl("retain_grad", CppFunction::makeFallthrough()); + m.impl("_fw_primal", CppFunction::makeFallthrough()); } diff --git 
a/aten/src/ATen/native/AutogradComposite.cpp b/aten/src/ATen/native/AutogradComposite.cpp new file mode 100644 index 000000000000..be7184a26565 --- /dev/null +++ b/aten/src/ATen/native/AutogradComposite.cpp @@ -0,0 +1,27 @@ +#include + +namespace at { +namespace native { + +/// This function can be used to create a dual Tensor that holds a tangent to compute forward mode gradients. +/// Note that the dual Tensor's primal is a view of the given primal and the given tangent is used as-is. +/// This function is backward differentiable. +at::Tensor make_dual(const at::Tensor& primal, const at::Tensor& tangent, int64_t level) { + TORCH_CHECK(!primal.fw_grad(level).defined(), "Making a dual Tensor based on a Tensor that " + "already has a forward gradient at the same level ", level, " is not supported."); + + auto dual_tensor = primal.view(primal.sizes()); + dual_tensor.set_fw_grad(tangent, level, /* is_inplace_op */ false); + return dual_tensor; +} + +/// This function can be used to unpack a given dual Tensor to get its primal and tangent. The returned primal +/// is a view of the dual and the tangent is returned as is. +/// This function is backward differentiable. +std::tuple unpack_dual(const at::Tensor& tensor, int64_t level) { + return std::tuple(tensor._fw_primal(level), tensor.fw_grad(level)); +} + +} // namespace native + +} // namespace at diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index d547e42156ea..e7d65dc0967d 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -40,5 +40,9 @@ void retain_grad(Tensor& self) { AT_ERROR("retain_grad is not implemented for Tensor"); } +Tensor _fw_primal(const Tensor& self, int64_t level) { + AT_ERROR("_fw_primal is not implemented for Tensor"); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f29d8efabf52..78ad11229428 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -105,6 +105,20 @@ manual_kernel_registration: True variants: method +- func: _fw_primal(Tensor(a) self, int level) -> Tensor(a) + use_c10_dispatcher: full + variants: method + dispatch: + DefaultBackend: _fw_primal + +- func: make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a) + use_c10_dispatcher: full + variants: function + +- func: unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent) + use_c10_dispatcher: full + variants: function + - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) use_c10_dispatcher: full variants: method diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 2375f6ceb65c..d42c8c23fe9c 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -599,6 +599,23 @@ class TORCH_API Tensor { return impl_->grad(); } + // The Forward AD API functions below are low level and are not to be used by end + // users who should use the API provided in torch/csrc/autograd.h + + /// This function returns the forward gradient for this Tensor at the given level. + const Tensor& fw_grad(uint64_t level) const { + return impl_->fw_grad(level, *this); + } + + /// This function can be used to set the value of the forward grad. + /// Note that the given new_grad might not be used directly if it has different + /// metadata (size/stride/storage offset) compared to this Tensor. 
In that case, + /// new_grad content will be copied into a new Tensor + void set_fw_grad(const Tensor& new_grad, uint64_t level, bool is_inplace_op) { + impl_->set_fw_grad(new_grad, *this, level, is_inplace_op); + } + + // STOP. Thinking of adding a method here, which only makes use // of other ATen methods? Define it in native_functions.yaml. diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index e305f352d7cb..5b4a4f3b83e6 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -44,6 +44,17 @@ const at::Tensor& TensorImpl::grad() const { return autograd_meta_->grad(); } +const at::Tensor& TensorImpl::fw_grad(uint64_t level, const at::Tensor& self) const { + // See TensorImpl::grad() above for explanation about the line below + if (!autograd_meta_) return impl::GetAutogradMetaFactory()->undefined_tensor(); + return autograd_meta_->fw_grad(level, self); +} + +void TensorImpl::set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op) { + if (!autograd_meta_) autograd_meta_ = impl::GetAutogradMetaFactory()->make(); + autograd_meta_->set_fw_grad(new_grad, self, level, is_inplace_op); +} + TensorImpl::TensorImpl( Storage&& storage, DispatchKeySet key_set, diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 5deab2a09832..3326404e1d07 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -136,6 +136,8 @@ struct C10_API AutogradMetaInterface { virtual bool requires_grad() const = 0; virtual at::Tensor& mutable_grad() = 0; virtual const at::Tensor& grad() const = 0; + virtual const at::Tensor& fw_grad(uint64_t level, const at::Tensor& self) const = 0; + virtual void set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op) = 0; virtual ~AutogradMetaInterface(); }; @@ -598,6 +600,42 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ const at::Tensor& grad() const; + /** + * Return the accumulated gradient of a tensor. This gradient is computed + * using forward mode AD. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be returned. Note that since levels are not fully + * supported yet, this argument should be 0. See documentation for + * torch::autograd::enter_dual_level for more details about forward AD nesting. + * - "self" should represent the Tensor whose forward grad is accessed. It is + * required when dealing with view. + */ + const at::Tensor& fw_grad(uint64_t level, const at::Tensor& self) const; + + /** + * Sets the forward gradient for this Tensor. + * The given Tensor might not be used directly and its content will be copied. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "new_grad" is a Tensor containing the new value of the gradient that should + * be set + * - "self" should reprensent the Tensor whose forward grad is accessed. It is + * required when dealing with view. + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be set. Note that since levels are not fully supported + * yet, this argument should be 0. See documentation for torch::autograd::enter_dual_level + * for more details about forward AD nesting. + * - "is_inplace_op" is a boolean flag that tells if this gradient was generated + * by an inplace operation or an out of place one. This allows better error checking. 
+ */ + void set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op); + /** * Return a typed data pointer to the actual data which this tensor refers to. * This checks that the requested type (from the template parameter) matches diff --git a/test/test_autograd.py b/test/test_autograd.py index d823732c613e..3d29529cab9a 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -35,6 +35,7 @@ IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) from torch.autograd import Variable, Function, detect_anomaly, kineto_available from torch.autograd.function import InplaceFunction +import torch.autograd.forward_ad as fwAD from torch.testing import randn_like from torch.testing._internal.common_methods_invocations import (method_tests, create_input, unpack_variables, @@ -5326,6 +5327,26 @@ def fn(a, dim0_size=5): self.assertEqual(x.grad, y.grad) + def test_view_with_multi_output(self): + x = torch.randn(2, 2, 2, dtype=torch.double) + + x1 = torch.view_as_complex(x) + # Taking an invalid view should always be allowed as long as it is not + # modified inplace + res = x1.unbind(0) + + with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): + res[0] += torch.rand(2, requires_grad=True) + + x.requires_grad_(True) + x1 = torch.view_as_complex(x) + # Taking an invalid view should always be allowed as long as it is not + # modified inplace + res = x1.unbind(0) + + with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): + res[0] += torch.rand(2, requires_grad=True) + def as_identity(self): # view_as_real and view_as_complex behavior should be like an identity def func(z): @@ -6324,6 +6345,66 @@ def foo(a): self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1)) self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0)) +class TestAutogradForwardMode(TestCase): + def test_forward_level_cleanup(self): + import weakref + + def get_tensor_and_weak_ref(): + # Helper function to get a Tensor and a weak ref that tells us + # if the c++ version of this Tensor is still alive or not. 
+ # + # Create the following reference chain to do so: + # - python Tensor t + # - c++ Tensor corresponding by t + # - c++ Node corresponding to t.grad_fn + # - python dict of metadata from this Node + # - an object in this dict that we can take a weakref of + + + # Create a new Tensor and Node + t = torch.rand(2, requires_grad=True).clone() + # Create the metadata dict + meta_dict = t.grad_fn.metadata + # Create the object in the dict + + class Foo(object): + pass + my_obj = Foo() + meta_dict[0] = my_obj + + # After exiting this function, the python Tensor t is the only + # thing keeping ref alive + ref = weakref.ref(my_obj) + return t, ref + + # Sanity check that the helper function works as expected + t, t_ref = get_tensor_and_weak_ref() + self.assertIsNotNone(t_ref()) + + del t + self.assertIsNone(t_ref()) + + # Main test code + foo = torch.rand(2) + + with fwAD.dual_level(): + tangent, tangent_ref = get_tensor_and_weak_ref() + self.assertIsNotNone(tangent_ref()) + + dual = fwAD.make_dual(foo, tangent) + self.assertIsNotNone(tangent_ref()) + + # Make sure that the tangent we provided has been re-used as is + self.assertTrue(fwAD.unpack_dual(dual)[1] is tangent) + + # Make sure that dual is keeping the tangent alive + del tangent + self.assertIsNotNone(tangent_ref()) + + # Make sure that the dual level does not keep the c++ + # version of the tangent alive + del dual + self.assertIsNone(tangent_ref()) # Generic device type autograd tests. class TestAutogradDeviceType(TestCase): diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index 88a01e48b5f2..bbb69f6e147f 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -12,7 +12,7 @@ all_operators_with_namedtuple_return = { 'max', 'min', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', - 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh' + 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "unpack_dual" } @@ -65,6 +65,7 @@ def test_namedtuple_return(self): op(operators=['triangular_solve'], input=(a,), names=('solution', 'cloned_coefficient'), hasout=True), op(operators=['lstsq'], input=(a,), names=('solution', 'QR'), hasout=True), op(operators=['linalg_eigh'], input=("L",), names=('eigenvalues', 'eigenvectors'), hasout=True), + op(operators=['unpack_dual'], input=(a, 0), names=('primal', 'tangent'), hasout=False), ] for op in operators: @@ -75,7 +76,9 @@ def test_namedtuple_return(self): for i, name in enumerate(op.names): self.assertIs(getattr(ret, name), ret[i]) else: - ret = getattr(a, f)(*op.input) + # Handle op that are not methods + func = getattr(a, f) if hasattr(a, f) else getattr(torch, f) + ret = func(*op.input) for i, name in enumerate(op.names): self.assertIs(getattr(ret, name), ret[i]) if op.hasout: diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index ff5415987af1..1f61ce3dfa20 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -80,7 +80,8 @@ 'nonzero(_(out|numpy))?', 'set_data', '.*_overrideable', # overrideable functions for backend extension - 'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retain_grad', 'set_' + 'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retain_grad', 'set_', + '_fw_primal' ] # These function signatures are not exposed to Python. 
Note that this signature diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 1c8f11eb7a11..b2dfe2667128 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -25,7 +25,7 @@ # For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys. # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp MANUAL_AUTOGRAD_AND_TRACER = set([ - 'resize_', 'resize_as_', 'detach', 'detach_', 'copy_', + 'resize_', 'resize_as_', 'detach', 'detach_', 'copy_', '_fw_primal', ]) # Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops: diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 071575b4fd7b..03fbf34034ea 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -689,7 +689,7 @@ def wrap_output(return_values, var): if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) - rhs_value = 'as_view({}, {}, /* is_differentiable */ false)'.format(view_info, var) + rhs_value = f'as_view({view_info}, {var}, /* is_bw_differentiable */ false, /* is_fw_differentiable */ false)' elif len(differentiable_output_vars) == 1: # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] @@ -704,13 +704,15 @@ def wrap_output(return_values, var): creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" else: creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - call += ("as_view(/* base */ {}, /* output */ {}, /* is_differentiable */ true, " - "/* creation_meta */ {});\n").format(view_info, var, creation_meta) + call += ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " + "/* creation_meta */ {});").format(view_info, var, creation_meta) rhs_value = 'std::move({})'.format(var) else: call += emit_view_lambda() creation_meta = "GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_differentiable */ true, " + rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " "/* view_func */ func, /* creation_meta */ {})").format(view_info, var, creation_meta) else: # This could be supported but we don't need it at the moment, so keeping things simple. diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 60636277a49b..a214684ab29c 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -90,6 +90,8 @@ core_sources_common = [ "torch/csrc/autograd/profiler_legacy.cpp", "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/autograd/profiler_utils.cpp", + "torch/csrc/autograd/autograd_meta.cpp", + "torch/csrc/autograd/forward_grad.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", "torch/csrc/jit/frontend/string_to_type.cpp", "torch/csrc/jit/mobile/type_parser.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 2a31552068a1..79c93cb191f1 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -522,6 +522,12 @@ def autocast_increment_nesting() -> _int: ... def autocast_decrement_nesting() -> _int: ... def set_anomaly_enabled(enabled: _bool) -> None: ... def is_anomaly_enabled() -> _bool: ... +def _enter_dual_level() -> _int: ... +def _exit_dual_level(level: _int) -> None: ... +def _make_dual(tensor: Tensor, tangent: Tensor, level: _int) -> Tensor: ... 
+def _unpack_dual(tensor: Tensor, level: _int) -> Tensor: ... +def __set_forward_AD_enabled(enabled: _bool) -> None: ... +def __is_forward_AD_enabled() -> _bool: ... # Defined in torch/csrc/jit/python/script_init.cpp class LoggerBase(object): diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index 5945713934ba..1aef783ee66f 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -275,11 +275,16 @@ def get_summarized_data(self): else: return torch.stack([get_summarized_data(x) for x in self]) -def _str_intern(self): +def _str_intern(inp): prefix = 'tensor(' indent = len(prefix) suffixes = [] + # This is used to extract the primal value and thus disable the forward AD + # within this function. + # TODO(albanD) This needs to be updated when more than one level is supported + self, tangent = torch.autograd.forward_ad.unpack_dual(inp) + # Note [Print tensor device]: # A general logic here is we only print device when it doesn't match # the device specified in default tensor type. @@ -355,17 +360,22 @@ def _str_intern(self): if self.layout != torch.strided: suffixes.append('layout=' + str(self.layout)) - if self.grad_fn is not None: - name = type(self.grad_fn).__name__ + # Use inp here to get the original grad_fn and not the one generated by the forward grad + # unpacking. + if inp.grad_fn is not None: + name = type(inp.grad_fn).__name__ if name == 'CppFunction': - name = self.grad_fn.name().rsplit('::', 1)[-1] + name = inp.grad_fn.name().rsplit('::', 1)[-1] suffixes.append('grad_fn=<{}>'.format(name)) - elif self.requires_grad: + elif inp.requires_grad: suffixes.append('requires_grad=True') if self.has_names(): suffixes.append('names={}'.format(self.names)) + if tangent is not None: + suffixes.append('tangent={}'.format(tangent)) + return _add_suffixes(prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse) def _str(self): diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 380b24edfaab..0bcf70e10e61 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -19,6 +19,7 @@ from .anomaly_mode import detect_anomaly, set_detect_anomaly from ..overrides import has_torch_function, handle_torch_function from . import functional +from . import forward_ad __all__ = ['Variable', 'Function', 'backward', 'grad_mode'] diff --git a/torch/autograd/forward_ad.py b/torch/autograd/forward_ad.py new file mode 100644 index 000000000000..3ad989621c88 --- /dev/null +++ b/torch/autograd/forward_ad.py @@ -0,0 +1,116 @@ +import torch +from .grad_mode import _DecoratorContextManager + +from typing import Any + +# TODO(alband): Once most of the formulas are implemented, these functions need to be added +# to the main doc to make them fully "public". + +# Global variable used to make the python API simpler to use +_current_level = -1 + +def enter_dual_level(): + r"""Function that can be used to enter a new forward grad level. + This level can be used to make and unpack dual Tensors to compute + forward gradients. + + This function also updates the current level that is used by default + by the other functions in this API. + """ + global _current_level + new_level = torch._C._enter_dual_level() + if new_level != _current_level + 1: + raise RuntimeError("Entering a new forward AD level but the current level " + "is not valid. Make sure you did not modified it directly.") + _current_level = new_level + return new_level + +def exit_dual_level(*, level=None): + r"""Function that can be used to exit a forward grad level. 
+ This function deletes all the gradients associated with this + level. Only deleting the latest entered level is allowed. + + This function also updates the current level that is used by default + by the other functions in this API. + """ + global _current_level + if level is None: + level = _current_level + if level != _current_level: + raise RuntimeError("Trying to exit a forward AD level that was not the last one " + "that was created. This is not supported.") + torch._C._exit_dual_level(level=level) + _current_level = level - 1 + +def make_dual(tensor, tangent, *, level=None): + r"""Function that creates a "dual object" that can be used to compute forward AD gradients + based on the given Tensor and its tangent. It returns a new Tensor that shares memory with + :attr:`tensor` and the :attr:`tangent` is used as-is. + + This function is backward differentiable. + + Given a function `f` whose jacobian is `J`, it allows to compute the jacobian vector product, + named `jvp`, between `J` and a given vector `v` as follows. + + Example:: + >>> inp = make_dual(x, v) + >>> out = f(inp) + >>> y, jvp = unpack_dual(out) + + """ + if level is None: + level = _current_level + + if level < 0: + raise RuntimeError("Trying to create a dual Tensor for forward AD but no level " + "exists, make sure to enter_dual_level() first.") + + return torch.make_dual(tensor, tangent, level=level) + +def unpack_dual(tensor, *, level=None): + r"""Function that unpacks a "dual object" to recover two plain tensors, one representing + the primal and the other the tangent (both are views of :attr:`tensor`. Neither of these + tensors can be dual tensor of level :attr:`level`. + + This function is backward differentiable. + """ + if level is None: + level = _current_level + + if level < 0: + return tensor, None + + return torch.unpack_dual(tensor, level=level) + +class dual_level(_DecoratorContextManager): + r"""Context-manager that controls the current forward ad level. It + appropriately enters and exit the dual level. + + This function also updates the current level that is used by default + by the other functions in this API. + + Example:: + + >>> x = torch.tensor([1]) + >>> x_t = torch.tensor([1]) + >>> with dual_level(): + ... inp = make_dual(x, x_t) + ... # Do computations with inp + ... out = your_fn(inp) + ... _, grad = unpack_dual(out) + >>> grad is None + False + >>> # After exiting the level, the grad is deleted + >>> _, grad_after = unpack_dual(out) + >>> grad is None + True + + """ + def __init__(self): + super().__init__() + + def __enter__(self): + return enter_dual_level() + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + exit_dual_level() diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index ccc6724b69c7..891d66e8f282 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -35,10 +35,22 @@ bool isDefined(const c10::optional& t) { return t.has_value() && t->defined(); } +bool isFwGradDefined(const c10::optional& t) { + return t.has_value() && t->defined() && t->fw_grad(/*level */ 0).defined(); +} + Tensor toLegacyTensor(const c10::optional& t) { return t.has_value() ? *t : Tensor(); } +Tensor toLegacyFwGrad(const c10::optional& t) { + return (t.has_value() && t->defined()) ? t->fw_grad(/*level */ 0) : Tensor(); +} + +Tensor toLegacyPrimal(const c10::optional& t) { + return (t.has_value() && t->defined()) ? 
t->_fw_primal(/*level */ 0) : Tensor(); +} + void copy_range(variable_list& out, IndexRange range, const Tensor & t) { AT_ASSERT(range.second <= out.size()); AT_ASSERTM(range.second - range.first == 1, "inconsistent range for Tensor output"); diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index b56d89c71657..73bd02960d7f 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -29,6 +29,10 @@ struct IndexRangeGenerator { size_t i = 0; }; +bool isFwGradDefined(const c10::optional& t); +Tensor toLegacyFwGrad(const c10::optional& t); +Tensor toLegacyPrimal(const c10::optional& t); + bool any_variable_defined(variable_list& variables); void copy_range(variable_list& out, IndexRange range, const at::Tensor & t); void copy_range(variable_list& out, IndexRange range, at::ArrayRef t); diff --git a/torch/csrc/autograd/TraceTypeManual.cpp b/torch/csrc/autograd/TraceTypeManual.cpp index 7e1f762a96a9..148725eecdea 100644 --- a/torch/csrc/autograd/TraceTypeManual.cpp +++ b/torch/csrc/autograd/TraceTypeManual.cpp @@ -139,6 +139,7 @@ TORCH_LIBRARY_IMPL(aten, Tracer, m) { m.impl("_version", CppFunction::makeFallthrough()); m.impl("requires_grad_", CppFunction::makeFallthrough()); m.impl("retain_grad", CppFunction::makeFallthrough()); + m.impl("_fw_primal", CppFunction::makeFallthrough()); } } // namespace diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index e64c9f816a1c..0663d7f46fa8 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -194,6 +195,39 @@ void retain_grad(Tensor & self) { impl::get_autograd_meta(self)->retains_grad_ = true; } +// Taken from codegened version +Tensor _fw_primal(const Tensor & self, int64_t level) { + auto& self_ = unpack(self, "self", 0); + std::shared_ptr grad_fn; + if (compute_requires_grad( self )) { + grad_fn = std::make_shared(); + grad_fn->set_next_edges(collect_next_edges( self )); + } + auto tmp = ([&]() { + at::AutoNonVariableTypeMode non_var_type_mode(true); + return self_.alias(); + })(); + c10::optional> func=c10::nullopt; + if (!self.unsafeGetTensorImpl()->support_as_strided()) { + auto size_vec = self.sizes().vec(); + func = [=](const at::Tensor& input_base) { + return input_base.view(size_vec); + }; + } + auto result = as_view(/* base */ self, /* output */ tmp, /* is_bw_differentiable */ true, + /* is_fw_differentiable */ false, /* view_func */ func, /* creation_meta */ CreationMeta::DEFAULT); + if (grad_fn) { + set_history(flatten_tensor_args( result ), grad_fn); + } + if (generated::details::isFwGradDefined(self)) { + // Modified from original codegen + // We explicitly want to ignore the forward grad at the given level + TORCH_CHECK(level == 0, "Invalid level given to _fw_primal"); + // End modified from original codegen + } + return result; +} + // We don't have an outplace copy, so this can't be generated automatically Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { jit::Value* output = nullptr; @@ -217,6 +251,24 @@ Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { } increment_version(self); rebase_history(self , std::move(grad_fn)); + + if (isDifferentiableType(self.scalar_type()) && + (generated::details::isFwGradDefined(self) || generated::details::isFwGradDefined(src))) { + auto self_fw_grad = generated::details::toLegacyFwGrad(self); + 
auto src_fw_grad = generated::details::toLegacyFwGrad(src); + Tensor new_fw_grad; + if (self_fw_grad.defined()) { + if (src_fw_grad.defined()) { + new_fw_grad = self_fw_grad.copy_(src_fw_grad); + } else { + new_fw_grad = self_fw_grad.fill_(0); + } + } else { + new_fw_grad = src_fw_grad; + } + self.set_fw_grad(new_fw_grad, /* level */ 0, /* is_inplace_op */ true); + } + return self; } @@ -232,6 +284,11 @@ Tensor& resize_( at::AutoNonVariableTypeMode non_var_type_mode(true); self_.resize_(size, std::move(optional_memory_format)); } + + if (self.fw_grad(/* level */ 0).defined()) { + AT_ERROR("cannot resize variables that has a forward grad"); + } + return self; } @@ -248,13 +305,28 @@ Tensor& resize_as_( at::AutoNonVariableTypeMode non_var_type_mode(true); at::resize_as_(self_, the_template_, std::move(optional_memory_format)); } + + // Handle fw grad + if (self.fw_grad(/* level */ 0).defined()) { + AT_ERROR("cannot resize variables that has a forward grad"); + } return self; } Tensor detach(const Tensor & self) { RECORD_FUNCTION("detach", std::vector({self})); - auto result = make_variable_non_differentiable_view(self, self, /*allow_tensor_metadata_change=*/false); + c10::optional> func=c10::nullopt; + auto result = as_view(/* base */ self, /* output */ self, /* is_bw_differentiable */ false, + /* is_fw_differentiable */ true, /* view_func */ func, /* creation_meta */ CreationMeta::DEFAULT, + /*allow_tensor_metadata_change=*/false); namedinference::propagate_names(result, self); + + // detach only backward gradients for both primal and tangent + if (self.fw_grad(/* level */ 0).defined()) { + auto new_fw_grad = self.fw_grad(/* level */ 0).detach(); + result.set_fw_grad(new_fw_grad, /* level */ 0, /* is_inplace_op */ false); + } + return result; } @@ -264,7 +336,7 @@ Tensor & detach_(Tensor & self) { // NB: is_view() ==> get_autograd_meta() auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(self)); // See NOTE [ View + Inplace detection ] - if (diff_view_meta->creation_meta == CreationMeta::MULTI_OUTPUT_SAFE) { + if (diff_view_meta->get_creation_meta() == CreationMeta::MULTI_OUTPUT_SAFE) { TORCH_WARN("This view is an output of a function that " "returns multiple views. Detaching such views inplace " "is being deprecated and will be forbidden " @@ -272,7 +344,8 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + AT_ERROR("Can't detach views in-place. Use detach() instead. " + "If you are using DistributedDataParallel (DDP) for training, " "and gradient_as_bucket_view is set as True, gradients are " "views of DDP buckets, and hence detach_() cannot be called " "on these gradients. To fix this error, please refer to the " @@ -290,6 +363,12 @@ Tensor & detach_(Tensor & self) { autograd_meta->set_requires_grad(false, self.unsafeGetTensorImpl()); autograd_meta->grad_fn_.reset(); autograd_meta->output_nr_ = 0; + + // detach only backward gradients for both primal and tangent + if (self.fw_grad(/* level */ 0).defined()) { + self.fw_grad(/* level */ 0).detach_(); + } + return self; } @@ -321,6 +400,7 @@ TORCH_LIBRARY_IMPL(aten, Autograd, m) { // and requires_grad_(), then remove the backend Autograd kernel here, only leaving the Math kernel. 
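The manual copy_, resize_, and detach changes above are easiest to understand from their user-visible behaviour. Below is a minimal Python sketch of what this hunk aims for, using the torch.autograd.forward_ad helpers added elsewhere in this patch; the tensor names are illustrative, and the temporary torch._C._set_forward_AD_enabled gate (registered further down, in torch/csrc/autograd/init.cpp) is assumed to be required, so it is switched on first. Treat it as an illustration rather than a guaranteed-passing test.

    import torch
    import torch.autograd.forward_ad as fwAD

    # Temporary gate added by this patch; forward AD is off by default for now.
    torch._C._set_forward_AD_enabled(True)

    x, v = torch.randn(3), torch.randn(3)
    dst = torch.zeros(3)                      # plain tensor, no tangent yet

    with fwAD.dual_level():
        dual = fwAD.make_dual(x, v)

        # copy_: the destination picks up the source's forward grad.
        dst.copy_(dual)
        _, dst_t = fwAD.unpack_dual(dst)      # now defined, numerically equal to v

        # detach: only the backward graph is cut; the tangent is kept
        # (a detached copy of it is re-attached to the result above).
        det = dual.detach()
        _, det_t = fwAD.unpack_dual(det)      # still defined

        # resize_ is explicitly rejected while a forward grad is set:
        # dual.resize_(6)  ->  "cannot resize variables that has a forward grad"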
m.impl("_backward", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_backward))); m.impl("requires_grad_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::requires_grad_))); + m.impl("_fw_primal", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_fw_primal))); } TORCH_LIBRARY_IMPL(aten, DefaultBackend, m) { diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index e67815e5609a..af02de68fc27 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -134,88 +134,111 @@ template inline variable_list flatten_tensor_args(Args&&... ar } // See NOTE [ Autograd View Variables ] for details. -inline Tensor as_view(const Tensor & base, const Tensor& tensor, bool is_differentiable, - c10::optional> view_func=c10::nullopt, - CreationMeta creation_meta=CreationMeta::DEFAULT) { - auto base_var = Variable(base); - if (base_var.is_view()) { - // Set `view_func` using the root base as input. - // `view_func` is used to recover views in backward when either as_strided is not supported - // or the view function changes the metadata which is not recorded by as_strided - // See Note [View + Inplace update on base tensor] and [View + Inplace update on view tensor] - // for more details how we use this function in backward. - auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base_var)); - if (view_func.has_value()) { - auto fn = view_func.value(); - // both current_view and it's parent have a view_func - if (diff_view_meta->has_view_fn()) { - auto prev_fn = diff_view_meta->view_fn(); - view_func = [=](const at::Tensor& root_base) { - auto temp = prev_fn(root_base); - return fn(temp); - }; +inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_differentiable, + bool is_fw_differentiable, c10::optional> view_func=c10::nullopt, + CreationMeta creation_meta=CreationMeta::DEFAULT, bool allow_tensor_metadata_change=true) { + if (!isForwardADEnabled()) { + // Fast codepath for backward only code + // It is useful as it avoids the creation of the temporary c10 which makes + // a significant difference when measuring instruction count for a single "t.view(-1)" call from c++. + if (is_bw_differentiable) { + if (base.is_view()) { + auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); + const auto& base_bw_info = diff_view_meta->get_backward_view(); + return make_variable_differentiable_view(tensor, base_bw_info.chain(base, tensor, view_func), + c10::nullopt, creation_meta, allow_tensor_metadata_change); } else { - // current_view has a view_func and but it's parent doesn't have one - if(base_var.unsafeGetTensorImpl()->support_as_strided()) { - auto size = base.sizes().vec(); - auto stride = base.strides().vec(); - auto storage_offset = base.storage_offset(); - view_func = [=](const at::Tensor& root_base) { - auto temp = root_base.as_strided(size, stride, storage_offset); - return fn(temp); - }; - } else { - // When base_var is a view but doesn't carry a view_fn in DifferentiableViewMeta, it's - // a view that doesn't support inplace update, e.g. unbind. - // In this case we should throw an error when inplace update happens in **forward**. - // One would naturally think the following function will be first called in backward pass. - // But the first call site is indeed in **forward** pass when we refresh `grad_fn` - // triggered by inplace update. 
- // Search Note [View + Inplace update for view tensor] to for the call site. - view_func = [=](const at::Tensor& root_base) { - TORCH_CHECK(false, "This view is the output of a function that returns multiple views." - "Such functions do not allow the output views to be modified inplace." - "You should replace the inplace operation by an out-of-place one"); - return root_base; - }; - } + return make_variable_differentiable_view(tensor, ViewInfo(base, view_func), + c10::nullopt, creation_meta, allow_tensor_metadata_change); } - } else if(diff_view_meta->has_view_fn()) { - // if current_view doesn't have a view_func but it's parent has one - auto prev_view_fn = diff_view_meta->view_fn(); - auto size = tensor.sizes().vec(); - auto stride = tensor.strides().vec(); - auto storage_offset = tensor.storage_offset(); - view_func = [=](const at::Tensor& root_base) { - auto temp = prev_view_fn(root_base); - return temp.as_strided(size, stride, storage_offset); - }; + } else { + TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, + "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT"); + return make_variable_non_differentiable_view(base, std::move(tensor), allow_tensor_metadata_change); } - base_var = base_var._base(); } - if (is_differentiable) { - return make_variable_differentiable_view(std::move(base_var), tensor, creation_meta, std::move(view_func)); + // Create both the forward and backward info that are needed + c10::optional new_bw_info; + c10::optional new_fw_info; + + if (is_bw_differentiable) { + if (base.is_view()) { + auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); + const auto& base_bw_info = diff_view_meta->get_backward_view(); + new_bw_info = base_bw_info.chain(base, tensor, view_func); + } else { + new_bw_info = ViewInfo(base, view_func); + } } else { TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, - "Non-differentiable views must have creation_meta=CreationMeta::DEFAULT"); - return make_variable_non_differentiable_view(std::move(base_var), tensor); + "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT"); + } + + if (is_fw_differentiable) { + // Check if base is a forward differentiable view + auto base_meta = torch::autograd::impl::get_autograd_meta(base); + auto is_view = base_meta && base_meta->is_view_; + if (is_view && static_cast(base_meta)->has_fw_view()) { + auto diff_view_meta = static_cast(base_meta); + const auto& base_fw_info = diff_view_meta->get_forward_view(); + new_fw_info = base_fw_info.chain(base, tensor, view_func); + } else { + new_fw_info = ViewInfo(base, view_func); + } + } + + if (is_fw_differentiable || is_bw_differentiable) { + return make_variable_differentiable_view(tensor, std::move(new_bw_info), std::move(new_fw_info), + creation_meta, allow_tensor_metadata_change); + } else { + return make_variable_non_differentiable_view(base, tensor, allow_tensor_metadata_change); } } // See NOTE [ Autograd View Variables ] for details. 
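Since as_view now threads both a backward and a forward ViewInfo, a view taken from a dual Tensor becomes a forward-differentiable view as well: its tangent should be the corresponding view of the base's tangent. A rough Python illustration, assuming the temporary forward-AD gate is already enabled and that the indexing op goes through this codepath:

    import torch
    import torch.autograd.forward_ad as fwAD

    x, v = torch.randn(3, 4), torch.ones(3, 4)

    with fwAD.dual_level():
        dual = fwAD.make_dual(x, v)
        row = dual[0]                      # forward differentiable view of `dual`
        _, row_t = fwAD.unpack_dual(row)
        # row_t is expected to match v[0], i.e. the same view taken on the tangent.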
-inline std::vector as_view(const Tensor & base, std::vector& tensors, bool is_differentiable, - CreationMeta creation_meta=CreationMeta::DEFAULT) { - auto base_var = Variable(base); - if (base_var.is_view()) { - base_var = base_var._base(); +inline std::vector as_view(const Tensor & base, std::vector& tensors, bool is_bw_differentiable, + bool is_fw_differentiable, CreationMeta creation_meta=CreationMeta::DEFAULT) { + c10::optional new_bw_info = c10::nullopt; + c10::optional new_fw_info = c10::nullopt; + + if (is_bw_differentiable) { + if (base.is_view()) { + auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); + const auto& base_bw_info = diff_view_meta->get_backward_view(); + TORCH_INTERNAL_ASSERT(creation_meta == CreationMeta::MULTI_OUTPUT_NODE || creation_meta == CreationMeta::MULTI_OUTPUT_SAFE, + "Functions that result multiple view must have a creation meta reflecting this behavior."); + // It is ok to create a ViewInfo where only the base is correct in this case as inplace operations on such views are + // not allowed + new_bw_info = ViewInfo(base_bw_info.base_, /* view_func */ c10::nullopt); + } else { + new_bw_info = ViewInfo(base, /* view_func */ c10::nullopt); + } + } else { + TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, + "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT"); } + if (isForwardADEnabled() && is_fw_differentiable) { + // Check if base is a forward differentiabble view + auto base_meta = torch::autograd::impl::get_autograd_meta(base); + auto is_view = base_meta && base_meta->is_view_; + if (is_view && static_cast(base_meta)->has_fw_view()) { + auto diff_view_meta = static_cast(base_meta); + const auto& base_fw_info = diff_view_meta->get_forward_view(); + TORCH_INTERNAL_ASSERT(creation_meta == CreationMeta::MULTI_OUTPUT_NODE || creation_meta == CreationMeta::MULTI_OUTPUT_SAFE, + "Functions that result multiple view must have a creation meta reflecting this behavior."); + // It is ok to create a ViewInfo where only the base is correct in this case as inplace operations on such views are + // not allowed + new_fw_info = ViewInfo(base_fw_info.base_, /* view_func */ c10::nullopt); + } else { + new_fw_info = ViewInfo(base, /* view_func */ c10::nullopt); + } + } + for(Tensor &tensor : tensors) { - if (is_differentiable) { - tensor = make_variable_differentiable_view(base_var, tensor, creation_meta); + if (is_fw_differentiable || is_bw_differentiable) { + tensor = make_variable_differentiable_view(tensor, new_bw_info, new_fw_info, creation_meta); } else { - TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, - "Non-differentiable views must have creation_meta=CreationMeta::DEFAULT"); - tensor = make_variable_non_differentiable_view(base_var, tensor); + tensor = make_variable_non_differentiable_view(base, tensor); } } return tensors; diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp index 858b329979ef..e1e70586a079 100644 --- a/torch/csrc/autograd/autograd.cpp +++ b/torch/csrc/autograd/autograd.cpp @@ -155,5 +155,18 @@ variable_list grad( outputs, gradients, retain_graph.value(), create_graph, inputs, allow_unused, /*accumulate_grad=*/false); } + +namespace forward_ad { + +uint64_t enter_dual_level() { + return ForwardADLevel::get_next_idx(); +} + +void exit_dual_level(uint64_t level) { + ForwardADLevel::release_idx(level); +} + +} // namespace forward_ad + } // namespace autograd } // namespace torch diff --git a/torch/csrc/autograd/autograd.h b/torch/csrc/autograd/autograd.h 
index c1e788fd6131..7f905b21c3b6 100644 --- a/torch/csrc/autograd/autograd.h +++ b/torch/csrc/autograd/autograd.h @@ -75,5 +75,20 @@ TORCH_API variable_list grad( bool create_graph = false, bool allow_unused = false); +namespace forward_ad { + +/// Creates a new dual level and returns its index. This level index should then be used to call +/// into the other functions below. +/// This API supports entering a new level before the previous one is exited. We call them nested +/// forward AD levels. These can be used to compute higher order derivatives. +TORCH_API uint64_t enter_dual_level(); + +/// Exits the given level. This will clear up all the gradients from this level and all dual Tensors +/// that had gradients for this level will become regular Tensors again. +/// This function can only be used to exit the innermost nesting level and so exiting must happen in +/// reverse order compared to the entering that was done with the function above. +TORCH_API void exit_dual_level(uint64_t level); + +} // namespace forward_ad } // namespace autograd } // namespace torch diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp new file mode 100644 index 000000000000..b06b0ff08c88 --- /dev/null +++ b/torch/csrc/autograd/autograd_meta.cpp @@ -0,0 +1,218 @@ +#include + +namespace torch { +namespace autograd { + +using at::Tensor; + +// [Forward Grad View/inplace] +// It is important to us to allow view and inplace to work with dual Tensors. These operations +// should either compute the right gradient or raise a user-friendly error. + +// The basic case where all Tensors are dual Tensors is as follows: +// # Have: +// # foo is a dual Tensor that is not a view +// # bar is a dual Tensor of appropriate size (depending on cases) that is not a view +// +// # Case 1: no view +// foo.copy_(bar) +// +// # Case 2: with view, propagate from view to base +// view = foo[0] +// view.copy_(bar) +// +// # Case 3: with view, propagate from base to view +// view = foo[0] +// foo.copy_(bar) +// +// # In both cases, the forward grad of foo must be properly updated. +// # In the second and third cases, the forward grad of view must match +// # the one of foo for the subset they have in common. +// +// All these cases can be handled by the following layout constraint on the forward grad: +// - A Tensor and its forward grad (for all levels) must have the same metadata (size, stride +// and storage offset). Storage offset must be in this metadata because of as_strided. +// - View operations must create a forward grad that is a view of the base's forward grad. +// - Inplace operations must modify the input's forward grad inplace. +// +// This layout constraint is ensured in the `set_fw_grad` function below + + +// More complex cases arrise when non-dual Tensor interact with dual Tensors. 
+// The two most important cases are: +// +// # Have: +// # foo is a regular Tensor that is not a view +// # bar is a dual Tensor of appropriate size (depending on cases) that is not a view +// +// # Case 4: Changes on the view must propagate to its base +// view = foo[0] +// # view is still a regular Tensor here +// view.copy_(bar) +// # Now both view and foo are dual Tensor with appropriate forward grad +// +// # Case 5: Changes on the base must propagate on all its views +// view = foo[0] +// # view is still a regular Tensor here +// base.copy_(bar) +// # Now both view and foo are dual Tensor with appropriate forward grad +// +// # NB there is a case 6 involving changes on a view propagating to other views +// # but it is fully described by the two others and is skipped in this discussion. +// +// Case 4 is handled by set_fw_grad by properly setting the forward grad of the base if needed. +// Case 5 is handled in fw_grad by reading the forward grad from the base if needed. + + +namespace { + // Check if two Tensor have the same storage offset, sizes and strides + bool has_same_meta(const Variable& base, const Variable& other) { + if (!base.defined() || !other.defined()) { + return false; + } + if (base.storage_offset() != other.storage_offset()) { + return false; + } + if (base.dim() != other.dim()) { + return false; + } + for (int64_t i=0; i lock(mutex_); + if (!fw_grad_) { + fw_grad_ = std::make_shared(); + } + } + if (fw_grad_->contains(level)) { + // Setting the forward grad again is only allowed if it is a no-op. + // We do allow this case to simplify writing codegen for inplace ops. + TORCH_INTERNAL_ASSERT(new_grad_.defined(), "Cannot set a forward grad that is an undefined Tensor. Use " + "_fw_primal(level) to get a new Tensor with this forward grad unset."); + + TORCH_INTERNAL_ASSERT(is_inplace_op, "Only inplace operations can re-set the forward grad of a Tensor that " + "already has one."); + + TORCH_INTERNAL_ASSERT(fw_grad_->value(level).is_same(new_grad_), "Cannot set a value of a forward grad if it " + "already exists. Inplace operations should modify it inplace."); + } else { + // TODO(alband) remove this spurious version counter bump + auto new_grad = new_grad_; + + if (is_inplace_op && is_view_) { + auto this_view_meta = static_cast(this); + + // For inplace ops on a Tensor that does not already have a forward grad and is a view, we propagate + // the tangent to the base and ensure that the new_grad is a view of that base's tangent. + // This ensure that case 4 from [Forward Grad View/inplace] above works fine + // What happens in this long if statement is: + // - Check if the base already has a grad + // - If not, set a new fw_grad for it full of zeros + // - Take a view of the base's forward grad + // - Copy the given new_grad into this view + // - Use this view as the new new_grad + if (this_view_meta->has_fw_view()) { + auto view_info = this_view_meta->get_forward_view(); + auto& base = view_info.base_; + + if (!base.fw_grad(level).defined()) { + // Enforce same meta here to make sure that the view op below is always valid + Tensor new_base_fw_grad; + if (has_same_meta(new_grad, base)) { + // TODO extend this special case to when the underlying storage of new_grad + // can be re-used. 
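Case 4 above (a dual value written into a view of a plain Tensor) is the subtle one, so a concrete sketch may help. It assumes the temporary forward-AD gate is already on and uses illustrative names:

    import torch
    import torch.autograd.forward_ad as fwAD

    foo = torch.randn(3, 4)                  # regular Tensor, no tangent

    with fwAD.dual_level():
        bar = fwAD.make_dual(torch.randn(4), torch.ones(4))
        view = foo[0]                        # still a regular Tensor here
        view.copy_(bar)                      # Case 4: the tangent must propagate to the base
        _, foo_t = fwAD.unpack_dual(foo)     # defined: zeros except row 0, which holds ones
        _, view_t = fwAD.unpack_dual(view)   # defined: the matching view of foo's tangent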
+ new_base_fw_grad = new_grad; + } else { + new_base_fw_grad = new_with_same_meta(base); + + // Update new_grad to be a view of the base + Tensor new_fw_grad_value; + if (view_info.has_view_fn()) { + new_fw_grad_value = view_info.view_fn()(new_base_fw_grad); + } else { + new_fw_grad_value = new_base_fw_grad.as_strided(self.sizes(), self.strides(), self.storage_offset()); + } + + new_fw_grad_value.copy_(new_grad); + new_grad = new_fw_grad_value; + } + + base.set_fw_grad(new_base_fw_grad, level, /* is_inplace_op */ false); + } + } + } + + // Enforce the basic layout constraint + if (!has_same_meta(new_grad, self)) { + Tensor new_grad_with_meta = new_with_same_meta(self); + new_grad_with_meta.copy_(new_grad); + new_grad = new_grad_with_meta; + } + + fw_grad_->set_value(new_grad, level); + } +} + +const Variable& AutogradMeta::fw_grad(uint64_t level, const Variable& self) const { + // Ensure that concurent fw_grad() "reads" are thread safe + std::lock_guard lock(mutex_); + + const auto& direct_fw_grad = fw_grad_ ? fw_grad_->value(level) : ForwardGrad::undef_grad(); + + if (!direct_fw_grad.defined() && is_view_) { + // For view that don't have a forward grad, check if their base has one that + // has been defined by an inplace operation. + // This ensure that case 5 from [Forward Grad View/inplace] above works fine + auto const_view_meta = static_cast(this); + // This is ok to do as we ONLY modify fw_grad_ and this field is properly locked in all methods + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + auto this_view_meta = const_cast(const_view_meta); + if (this_view_meta->has_fw_view()) { + const auto& view_info = this_view_meta->get_forward_view(); + const auto& base = view_info.base_; + + const auto& base_val = base.fw_grad(level); + if (base_val.defined()) { + // Lazy initialization of fw_grad_ + this_view_meta->fw_grad_ = std::make_shared(); + + Variable new_val; + if (view_info.has_view_fn()) { + new_val = view_info.view_fn()(base_val); + } else { + new_val = base_val.as_strided(self.sizes(), self.strides(), self.storage_offset()); + } + + this_view_meta->fw_grad_->set_value(new_val, level); + return this_view_meta->fw_grad_->value(level); + } + } + } + return direct_fw_grad; +} + +}} // torch::autograd diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index c7520315d9f3..f6d28ec342b6 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -124,7 +124,7 @@ variable_list _wrap_outputs(const variable_list &input_vars, if (!(is_input && is_modified) && var.is_view()) { // NB: is_view() ==> get_autograd_meta() auto diff_view_meta = static_cast(impl::get_autograd_meta(var)); - diff_view_meta->creation_meta = CreationMeta::IN_CUSTOM_FUNCTION; + diff_view_meta->set_creation_meta(CreationMeta::IN_CUSTOM_FUNCTION); } if (is_differentiable) { @@ -142,7 +142,7 @@ variable_list _wrap_outputs(const variable_list &input_vars, if (var.is_view()) { // NB: is_view() ==> get_autograd_meta() auto diff_view_meta = static_cast(impl::get_autograd_meta(var)); - diff_view_meta->creation_meta = CreationMeta::MULTI_OUTPUT_NODE; + diff_view_meta->set_creation_meta(CreationMeta::MULTI_OUTPUT_NODE); } } } diff --git a/torch/csrc/autograd/forward_grad.cpp b/torch/csrc/autograd/forward_grad.cpp new file mode 100644 index 000000000000..bb8f19f252a8 --- /dev/null +++ b/torch/csrc/autograd/forward_grad.cpp @@ -0,0 +1,90 @@ +#include + +namespace torch { namespace autograd { + +namespace { + // See discussion in 
forward_grad.h for why these are global variables and not + // thread local + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + static std::mutex all_forward_levels_mutex_; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + static uint64_t next_forward_idx_ = 0; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + static std::vector> all_forward_levels_; + + const static at::Tensor singleton_undefined_tensor; + + // Temporary flag to disable forward mode + // TODO(alband) remove these when perf issues are solved + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + static bool is_forward_grad_enabled = false; +} + +uint64_t ForwardADLevel::get_next_idx() { + std::lock_guard lock(all_forward_levels_mutex_); + TORCH_CHECK(next_forward_idx_ == 0, "Nested forward mode AD is not supported at the moment"); + auto new_index = next_forward_idx_++; + TORCH_INTERNAL_ASSERT(new_index == all_forward_levels_.size()); + all_forward_levels_.push_back(std::make_shared(new_index)); + return new_index; +} + +void ForwardADLevel::release_idx(uint64_t idx) { + std::lock_guard lock(all_forward_levels_mutex_); + TORCH_CHECK(idx == all_forward_levels_.size() - 1, "Exiting a forward AD level that is not the " + "last that was created is not support. Ensure they are released in the reverse " + "order they were created."); + TORCH_CHECK(idx >= 0, "No forward AD level was created so you cannot exit it."); + next_forward_idx_--; + all_forward_levels_.pop_back(); + +} +std::shared_ptr ForwardADLevel::get_by_idx(uint64_t idx) { + std::lock_guard lock(all_forward_levels_mutex_); + TORCH_CHECK(idx < all_forward_levels_.size(), "Trying to access a forward AD level with an invalid index. " + "This index was either not created or is already deleted."); + return all_forward_levels_[idx]; +} + +std::shared_ptr ForwardADLevel::try_get_by_idx(uint64_t idx) { + std::lock_guard lock(all_forward_levels_mutex_); + if (idx < all_forward_levels_.size()) { + return all_forward_levels_[idx]; + } else { + return nullptr; + } +} + +ForwardADLevel::~ForwardADLevel() { + std::lock_guard lock(mutex_); + auto it = grads_.begin(); + while (it != grads_.end()) { + // Warning this will lock *it mutex + // This is ok as this function is the *only* one to call back into another class's method. + (*it)->reset(idx_, /* update_level */ false); + it = grads_.erase(it); + } +} + +const at::Tensor& ForwardGrad::value(uint64_t level) const { + std::lock_guard lock(mutex_); + const auto& it = content_.find(level); + return it == content_.end() ? singleton_undefined_tensor : (*it).second; +} + +const at::Tensor& ForwardGrad::undef_grad() { + return singleton_undefined_tensor; +} + +// Temporary functions to disable forward AD +// TODO(alband) remove these when perf issues are solved +bool isForwardADEnabled() { + return is_forward_grad_enabled; +} + +void setForwardADEnabled(bool value) { + is_forward_grad_enabled = value; +} + +}} // namespace torch::autograd diff --git a/torch/csrc/autograd/forward_grad.h b/torch/csrc/autograd/forward_grad.h new file mode 100644 index 000000000000..2f0e66034f38 --- /dev/null +++ b/torch/csrc/autograd/forward_grad.h @@ -0,0 +1,193 @@ +#pragma once + +#include + + +namespace torch { namespace autograd { + +// [ Using ForwardGrad ] +// ForwardGrad needs to be a shared_ptr to satisfy constraints of its inner design. 
But +// this shared_ptr must be uniquely associated with the object that stores it (as of +// writing, either AutogradMeta or SavedVariable). This object is called the "owning object" +// in the discussions below. This owning object must call `ForwardGrad::clear()` when it +// is destroyed to ensure that the ForwardGrad is properly de-allocated. + +struct ForwardGrad; + +// This file contains two classes that are used to store forward AD gradients and +// ensure that they are scoped properly. +// Because forward AD runs concurrently with the evaluation of the function, we need +// a mechanism to separate different forward AD invocations and be able to compute the +// right gradients. We model such invocations as levels here. +// The particular scoping issue mentioned above has two main drivers: +// - Ensure that we can conveniently use forward AD within a high level API without +// leaking the forward AD states outside. +// - Ensure that we can keep the level that we expose to the user API simple (an integer +// that represents the nesting depth) while avoiding confusions when the level index +// is re-used. + +// The important external APIs from this file are: +// - ForwardADLevel::get_next_idx() that can be used to enter a new level and get its index +// - ForwardADLevel::release_idx() that can be used to exit a given level. +// - ForwardGrad() can be used to store a given forward gradient that will handle the level +// tracking automatically. + +// The basic implementation strategy is as follows: +// Every tensor has a ForwardGrad, maintaining a map from levels to tangents. +// ForwardGrad is responsible for registering itself to the appropriate ForwardADLevel when a new +// tangent is added to it via ForwardGrad::set_value and to un-register itself from this same level +// if that tangent is removed via ForwardGrad::reset. +// The ForwardADLevel is created when a new level is entered via ForwardADLevel::get_next_idx. +// A reference to the new ForwardADLevel is stored into a global (for the whole process) vector that +// ensure it can be accessed via ForwardADLevel::get_by_idx. This reference is deleted when the index is +// released by the user when calling ForwardADLevel::release_idx. +// When it is destructed, the ForwardADLevel is responsible for clearing all the tangents for its +// level stored in all the ForwardGrad that registered with it. +// +// This process-wide level design, compared to a thread local one, allows us to use very simple user facing +// handle for the level (an int) while enabling cross-thread forward AD. +// The only required synchronization for the user is when entering and exiting the levels. +// Some discussion on alternative design is in https://github.com/pytorch/pytorch/pull/49097#discussion_r543716453 +// and can be refined in the future. + +// Correctness of concurrency: +// Each class uses its own lock when reading or modifying internal storages. This allows in particular +// to safely remove tangents from ForwardGrad when the ForwardADLevel is being exited. +// We ensure no deadlock by ensuring that a methods never calls into another class's method while +// the local class's lock is held except in one single case: calling from ForwardADLevel's destructor +// into ForwardGrad::reset with update_level=false. 
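The level bookkeeping that these comments describe is small enough to show end to end. A sketch of the intended Python-side usage, with the error text quoted from forward_grad.cpp and nothing assumed beyond what the patch itself registers:

    import torch.autograd.forward_ad as fwAD

    lvl = fwAD.enter_dual_level()        # the first level gets index 0
    # Calling enter_dual_level() again here would currently raise
    # "Nested forward mode AD is not supported at the moment".
    fwAD.exit_dual_level(level=lvl)      # only the innermost level may be exited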
+ +// The lifetime of these objects is as follows: +// The ForwardADLevel can be in three states: +// - Initialized: where one of its reference is held by the global vector and there may be more +// references held by temporary variables in ForwardGrad's methods. +// - About to be destructed: where "release_idx" has been called and the only reason for the +// ForwardADLevel not to be destructed right away is that some methods in ForwardGrad have +// owning reference to it. This is done so that a ForwardADLevel can never be destructed when +// a ForwardGrad is registered with it and in the process of adding something to its internal state. +// - Being destructed: Here the ForwardADLevel is not referenced anymore and can be safely reset +// all of the ForwardGrad. Note that we can have more than one reset being called here (which is ok) +// but we are guaranteed that there is at least one. +// The ForwardGrad is simpler as there is no intermediary state and no special destructor for. The logic to +// unregister it from the different ForwardADLevel is done when the owning object (AutogradMeta or +// SavedVariable) is being destroyed. + +// Other considered design: +// To avoid having the ForwardGrad::clear, we considered storing weak_ptr inside the ForwardADLevel. While this +// would work, it would mean that the set inside the ForwardADLevel would only grow unless we do an +// expensive linear scan to remove all the dangling weak pointers. Hence this approach was not used. + +// Data structures in this file are optimized for this maximum number of levels. +// The number of levels corresponds to the degree of the gradient being +// computed using forward AD and we don't expect more than second order gradients +// to be common. +#define EXPECTED_MAX_LEVEL 2 + +struct TORCH_API ForwardADLevel { + ForwardADLevel(uint64_t idx): idx_(idx) {} + ~ForwardADLevel(); + + static uint64_t get_next_idx(); + static void release_idx(uint64_t idx); + static std::shared_ptr get_by_idx(uint64_t idx); + static std::shared_ptr try_get_by_idx(uint64_t idx); + + void erase(const std::shared_ptr& grad) { + std::lock_guard lock(mutex_); + grads_.erase(grad); + } + + void insert(const std::shared_ptr& grad) { + std::lock_guard lock(mutex_); + grads_.insert(grad); + } + +private: + std::unordered_set> grads_; + std::mutex mutex_; + uint64_t idx_; + +}; + +struct TORCH_API ForwardGrad : std::enable_shared_from_this { + + ForwardGrad() {} + + // This function must only be called when AutogradMeta or SavedVariable is being + // destructed as it ensures that: + // - The only (potential) other references to this ForwardGrad are the + // different level it is registered to + // - No other thread will try to call `set_value` or `value` ever from now on + // - Any of the ForwardADLevel that this ForwardGrad is registered with might + // call `reset` at any point during this function + void clear() { + c10::SmallVector levels_idx; + + { + std::lock_guard lock(mutex_); + for (auto& c: content_) { + levels_idx.push_back(c.first); + } + } + + for (auto l_idx: levels_idx) { + // Use "try" version here as another thread might have deleted this + // level before we got here + // This is an owning reference as we want to keep the level alive + // until we successfully unregister ourselves + auto level = ForwardADLevel::try_get_by_idx(l_idx); + if (level) { + level->erase(shared_from_this()); + } + } + } + + void set_value(const at::Tensor& value, uint64_t level) { + // Owning reference to ensure the forward_level is not destroyed + 
// while we are updating our internal state + auto forward_level = ForwardADLevel::get_by_idx(level); + forward_level->insert(shared_from_this()); + + std::lock_guard lock(mutex_); + content_.insert({level, value}); + } + + // This function removes the tangent for a given level from this ForwardGrad + // Use the update_level flag to disable notifying the level about this reset + // This flag is most notably used by the ForwardADLevel destructor. + void reset(uint64_t level, bool update_level=true) { + if (update_level) { + ForwardADLevel::get_by_idx(level)->erase(shared_from_this()); + } + + std::lock_guard lock(mutex_); + content_.erase(level); + } + + const at::Tensor& value(uint64_t level) const; + + bool contains(uint64_t level) { + std::lock_guard lock(mutex_); + return content_.count(level) > 0; + } + + bool empty() const { + return content_.empty(); + } + + static const at::Tensor& undef_grad(); + + +private: + // TODO(albanD): replace this with a SmallVector + std::unordered_map content_; + mutable std::mutex mutex_; + +}; + +// Temporary functions to disable forward AD +// TODO(alband) remove these when perf issues are solved +bool TORCH_API isForwardADEnabled(); +void TORCH_API setForwardADEnabled(bool value); + +}} // namespace torch::autograd diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index 6ce068bd58de..b5991b87f835 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -47,4 +47,8 @@ auto UndefinedGradBackward::apply(variable_list&& output_grads) -> variable_list return input_grads; } +auto Identity::apply(variable_list&& grads) -> variable_list { + return std::move(grads); +} + }} // namespace torch::autograd diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index 1a4615466ec2..8a312b7baf0c 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -83,4 +83,8 @@ struct TORCH_API GraphRoot : public Node { variable_list outputs; }; +struct TORCH_API Identity : public Node { + variable_list apply(variable_list&& inputs) override; +}; + }} diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 10f419c9bee5..ca419522dff8 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -3,11 +3,15 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { using namespace torch::autograd::profiler; @@ -230,6 +234,26 @@ static PyObject * autocast_decrement_nesting(PyObject* _unused, PyObject *arg) { END_HANDLE_TH_ERRORS } +static PyObject * set_forward_AD_enabled(PyObject* _unused, PyObject *arg) { + HANDLE_TH_ERRORS + if (!PyBool_Check(arg)) { + throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name); + } + setForwardADEnabled(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject * is_forward_AD_enabled(PyObject* _unused, PyObject *arg) { + HANDLE_TH_ERRORS + if (isForwardADEnabled()) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + static PyObject * set_grad_enabled(PyObject* _unused, PyObject *arg) { HANDLE_TH_ERRORS if (!PyBool_Check(arg)) { @@ -270,10 +294,34 @@ static PyObject * is_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) { END_HANDLE_TH_ERRORS } +static PyObject * python_enter_dual_level(PyObject* _unused, 
PyObject* arg) { + HANDLE_TH_ERRORS + // It is unlikely that the depth of forward nesting will overflow int64_t so we + // just static cast here. + return utils::wrap(static_cast(forward_ad::enter_dual_level())); + END_HANDLE_TH_ERRORS +} + +static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyObject* kwargs) { + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "exit_dual_level(int64_t level)" + }); + + ParsedArgs<1> parsed_args; + auto _r = parser.parse(args, kwargs, parsed_args); + + forward_ad::exit_dual_level(_r.toInt64(0)); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + // autograd methods on torch._C static PyMethodDef methods[] = { // NOLINT {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr}, {"is_grad_enabled", is_grad_enabled, METH_NOARGS, nullptr}, + {"_set_forward_AD_enabled", set_forward_AD_enabled, METH_O, nullptr}, + {"_is_forward_AD_enabled", is_forward_AD_enabled, METH_NOARGS, nullptr}, {"set_autocast_enabled", set_autocast_enabled, METH_O, nullptr}, {"is_autocast_enabled", is_autocast_enabled, METH_NOARGS, nullptr}, {"clear_autocast_cache", clear_autocast_cache, METH_NOARGS, nullptr}, @@ -281,6 +329,8 @@ static PyMethodDef methods[] = { // NOLINT {"autocast_decrement_nesting", autocast_decrement_nesting, METH_NOARGS, nullptr}, {"set_anomaly_enabled", set_anomaly_mode_enabled, METH_O, nullptr}, {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr}, + {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr}, + {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr}, {nullptr, nullptr, 0, nullptr} }; diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index d2d43bdc25c2..d8058a1748c5 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -24,6 +24,12 @@ SavedVariable::SavedVariable(const Variable& variable, bool is_output, bool is_i // These copies are all shared_ptr copies, so slightly more expensive. // Do them here instead of in the init list in case data is undefined. data_ = variable.tensor_data(); + // TODO(albanD) This needs to be updated when moving to multiple levels + const auto& fw_grad = variable.fw_grad(/* level */ 0); + if (fw_grad.defined()) { + fw_grad_ = std::make_shared(); + fw_grad_->set_value(fw_grad, /* level */ 0); + } if (variable.is_leaf()) { grad_accumulator_ = impl::grad_accumulator(variable); } else if (!is_output) { @@ -100,6 +106,16 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { throw std::logic_error("No grad accumulator for a saved leaf!"); impl::set_grad_accumulator(var, grad_accumulator_); + // NB: var here is never a view so there is no need to make anything special + // for the case where the saved Tensor was a view. This whole argument relies + // on the fact that the Tensor returned by this function is never + // modified in-place. 
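These are the raw torch._C entry points that torch.autograd.forward_ad wraps. Note that the .pyi stub earlier in this patch declares the enable/query functions with a double leading underscore (__set_forward_AD_enabled, __is_forward_AD_enabled) while the table above registers single-underscore names, which looks like an inconsistency in the stub. A quick sketch of direct usage, for illustration only:

    import torch

    torch._C._set_forward_AD_enabled(True)     # temporary on/off switch (takes a bool)
    assert torch._C._is_forward_AD_enabled()

    lvl = torch._C._enter_dual_level()         # returns the new level index
    torch._C._exit_dual_level(level=lvl)       # parsed as "exit_dual_level(int64_t level)"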
+ if (fw_grad_ && !fw_grad_->empty()) { + // TODO(albanD) This needs to be updated when moving to multiple levels + auto new_fw_grad = fw_grad_->value(/* level */ 0); + var.set_fw_grad(new_fw_grad, /* level */ 0, /* is_inplace_op */ false); + } + return var; } diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index f9533d3629e0..dde0ffa18a21 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -23,6 +24,12 @@ class TORCH_API SavedVariable { SavedVariable(const c10::optional& variable, bool is_output, bool is_inplace_view=false); SavedVariable(SavedVariable&&) = default; SavedVariable& operator=(SavedVariable&&) = default; + ~SavedVariable() { + if (fw_grad_) { + // See note [ Using ForwardGrad ] + fw_grad_->clear(); + } + } /// Reconstructs the saved variable. Pass `saved_for` as the gradient /// function if constructing the `SavedVariable` with it would have caused a @@ -40,6 +47,11 @@ class TORCH_API SavedVariable { private: at::Tensor data_; + // This field is used to store the forward AD gradients associated with + // the saved Tensor. Note that this shared_ptr must never be shared with + // either the saved Tensor or the unpacked Tensor. See note [ Using ForwardGrad ] + std::shared_ptr fw_grad_; + // The gradient function associated with this node. If has_grad_fn // is false, then this is a leaf node. Note that the grad_fn is not saved if // it would create a circular reference. In that case, the grad_fn must be diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 8977bcaf5920..e2e8f5c2b6c4 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -20,28 +21,83 @@ #include #include #include +#include namespace torch { namespace autograd { -DifferentiableViewMeta::DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base, - c10::optional> view_fn, +DifferentiableViewMeta::DifferentiableViewMeta(at::TensorImpl* self_impl, + c10::optional backward_info, + c10::optional forward_info, CreationMeta creation_meta) - : AutogradMeta(self_impl), creation_meta(creation_meta) { - base_ = std::move(base); - view_fn_ = std::move(view_fn); - TORCH_CHECK(base_.defined(), "base is undefined"); - if (base_.is_view()) { - base_ = base_._base(); - } + : AutogradMeta(self_impl), + backward_info_(std::move(backward_info)), + forward_info_(std::move(forward_info)), + creation_meta(creation_meta) { is_view_ = true; - self_impl->set_version_counter(impl::version_counter(base_)); - attr_version = self_impl->version_counter().current_version(); + if (backward_info_.has_value()) { + self_impl->set_version_counter(impl::version_counter(backward_info_.value().base_)); + attr_version = self_impl->version_counter().current_version(); + } } -DifferentiableViewMeta::~DifferentiableViewMeta() { - base_.reset(); +// Chain this view info with the new view op between base and tensor +ViewInfo ViewInfo::chain(const Variable & base, const Variable & tensor, + c10::optional> view_func) const { + // Set `view_func` using the root base as input. 
+ // `view_func` is used to recover views in backward when either as_strided is not supported + // or the view function changes the metadata which is not recorded by as_strided + // See Note [View + Inplace update on base tensor] and [View + Inplace update on view tensor] + // for more details how we use this function in backward. + if (view_func.has_value()) { + auto fn = view_func.value(); + // both current_view and it's parent have a view_func + if (view_fn_.has_value()) { + auto prev_fn = view_fn_.value(); + view_func = [=](const at::Tensor& root_base) { + auto temp = prev_fn(root_base); + return fn(temp); + }; + } else { + // current_view has a view_func and but it's parent doesn't have one + if (base.unsafeGetTensorImpl()->support_as_strided()) { + auto size = base.sizes().vec(); + auto stride = base.strides().vec(); + auto storage_offset = base.storage_offset(); + view_func = [=](const at::Tensor& root_base) { + auto temp = root_base.as_strided(size, stride, storage_offset); + return fn(temp); + }; + } else { + // When base is a view but doesn't carry a view_fn in DifferentiableViewMeta, it's + // a view that doesn't support inplace update, e.g. unbind. + // In this case we should throw an error when inplace update happens in **forward**. + // One would naturally think the following function will be first called in backward pass. + // But the first call site is indeed in **forward** pass when we refresh `grad_fn` + // triggered by inplace update. + // Search Note [View + Inplace update for view tensor] to for the call site. + view_func = [=](const at::Tensor& root_base) { + TORCH_CHECK(false, "This view is the output of a function that returns multiple views." + "Such functions do not allow the output views to be modified inplace." + "You should replace the inplace operation by an out-of-place one"); + return root_base; + }; + } + } + } else if(view_fn_.has_value()) { + // if current_view doesn't have a view_func but it's parent has one + auto prev_view_fn = view_fn_.value(); + auto size = tensor.sizes().vec(); + auto stride = tensor.strides().vec(); + auto storage_offset = tensor.storage_offset(); + view_func = [=](const at::Tensor& root_base) { + auto temp = prev_view_fn(root_base); + return temp.as_strided(size, stride, storage_offset); + }; + } + + return ViewInfo(base_, view_func); } namespace { @@ -81,21 +137,23 @@ namespace impl { auto diff_view_meta = static_cast(get_autograd_meta(self)); // See NOTE [ View + Inplace detection ] - if (diff_view_meta->creation_meta != CreationMeta::MULTI_OUTPUT_SAFE) { + auto creation_meta = diff_view_meta->get_creation_meta(); + if (creation_meta != CreationMeta::MULTI_OUTPUT_SAFE) { // Do not use handle_view_on_rebase here as check_inplace should have been called before this // and either throw an error or clear the warning // Temporary error message as a full fix is too risky for now // Should be an internal assert again - TORCH_INTERNAL_ASSERT(diff_view_meta->creation_meta == CreationMeta::DEFAULT); + TORCH_INTERNAL_ASSERT(creation_meta == CreationMeta::DEFAULT); TORCH_INTERNAL_ASSERT(gradient_edge.input_nr == 0); TORCH_INTERNAL_ASSERT(gradient_edge.function); TORCH_CHECK( gradient_edge.function->num_inputs() == 1, "Functions which modify views in-place must return a single Variable"); + auto view_info = diff_view_meta->get_backward_view(); diff_view_meta->output_nr_ = gradient_edge.input_nr; auto copy_slices = std::make_shared( - diff_view_meta->base_, at::TensorGeometry(self), diff_view_meta->view_fn_, 
std::move(gradient_edge.function)); - set_gradient_edge(diff_view_meta->base_, {std::move(copy_slices), 0}); + view_info.base_, at::TensorGeometry(self), view_info.view_fn_, std::move(gradient_edge.function)); + set_gradient_edge(view_info.base_, {std::move(copy_slices), 0}); self.grad_fn(); // trigger an update to the view's grad_fn return; } @@ -181,7 +239,7 @@ namespace impl { if (self.is_view()) { // NB: is_view() ==> get_autograd_meta() auto diff_view_meta = static_cast(meta); - diff_view_meta->attr_version = self._version(); + diff_view_meta->set_attr_version(self._version()); } } @@ -298,12 +356,14 @@ Tensor VariableHooks::tensor_data(const Tensor& self) const { return at::Tensor(self_impl_copy); } -// View Variables +// Backward View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ bool VariableHooks::is_view(const Tensor& self) const { - if (torch::autograd::impl::get_autograd_meta(self)) { - return torch::autograd::impl::get_autograd_meta(self)->is_view_; + auto meta = torch::autograd::impl::get_autograd_meta(self); + if (meta && meta->is_view_) { + auto diff_view_meta = static_cast(meta); + return diff_view_meta->has_bw_view(); } else { return false; } @@ -313,9 +373,10 @@ const Tensor& VariableHooks::base(const Tensor& self) const { if (self.is_view()) { // is_view() implies get_autograd_meta() auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(self)); - return diff_view_meta->base_; + TORCH_CHECK(diff_view_meta->has_bw_view(), "Can't get base of non-backward view Tensor"); + return diff_view_meta->get_backward_view().base_; } else { - throw std::runtime_error("Can't get base of non-view Variable"); + throw std::runtime_error("Can't get base of non-view Tensor"); } } @@ -342,13 +403,14 @@ const std::shared_ptr& VariableHooks::grad_fn(const Tenso auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(self)); // See NOTE [ View + Inplace detection ] - if (diff_view_meta->creation_meta != CreationMeta::MULTI_OUTPUT_SAFE) { + if (diff_view_meta->get_creation_meta() != CreationMeta::MULTI_OUTPUT_SAFE) { std::lock_guard lock(diff_view_meta->mutex_); - if (!diff_view_meta->grad_fn_ && !diff_view_meta->base_.requires_grad()) { + auto view_info = diff_view_meta->get_backward_view(); + if (!diff_view_meta->grad_fn_ && !view_info.base_.requires_grad()) { return diff_view_meta->grad_fn_; } auto current_version = self._version(); - if (diff_view_meta->attr_version != current_version) { + if (diff_view_meta->get_attr_version() != current_version) { // This is an indirect rebase_history due to another view or the base being modified inplace handle_view_on_rebase(diff_view_meta, /* indirect */ true); TORCH_INTERNAL_ASSERT(diff_view_meta->output_nr_ == 0); @@ -377,24 +439,24 @@ const std::shared_ptr& VariableHooks::grad_fn(const Tenso // // TODO: Potentially the following logic can be replaced by special logic in VariableType_x.cpp // that would provide a way to recreate the grad_fn chain. 
- if (diff_view_meta->has_view_fn()) { - auto view_fn = diff_view_meta->view_fn(); - auto diff_view = view_fn(diff_view_meta->base_); + if (view_info.has_view_fn()) { + auto view_fn = view_info.view_fn(); + auto diff_view = view_fn(view_info.base_); diff_view_meta->grad_fn_ = diff_view.grad_fn(); } else { auto fn = std::make_shared(); - fn->self_geometry = at::TensorGeometry(diff_view_meta->base_); + fn->self_geometry = at::TensorGeometry(view_info.base_); fn->size = self.sizes().vec(); fn->stride = self.strides().vec(); fn->storage_offset = self.storage_offset(); - fn->set_next_edges(torch::autograd::collect_next_edges(diff_view_meta->base_)); + fn->set_next_edges(torch::autograd::collect_next_edges(view_info.base_)); fn->add_input_metadata( - diff_view_meta->base_.options(), + view_info.base_.options(), self.sizes(), // Note: sizes(), not base_.sizes(), is intentional - diff_view_meta->base_.device()); + view_info.base_.device()); diff_view_meta->grad_fn_ = std::move(fn); } - diff_view_meta->attr_version = current_version; + diff_view_meta->set_attr_version(current_version); } return diff_view_meta->grad_fn_; } @@ -429,7 +491,8 @@ unsigned VariableHooks::_register_hook(const Tensor& self, std::functioncreation_meta != CreationMeta::DEFAULT) { + auto creation_meta = diff_view_meta->get_creation_meta(); + if (creation_meta != CreationMeta::DEFAULT) { auto grad_fn = diff_view_meta->grad_fn_.get(); std::string msg; std::string modified_obj; @@ -446,24 +509,24 @@ void handle_view_on_rebase(DifferentiableViewMeta* diff_view_meta, bool indirect msg = c10::str("A view was created in no_grad mode and ", modified_obj, " modified inplace with grad mode enabled."); } - if (diff_view_meta->creation_meta == CreationMeta::MULTI_OUTPUT_NODE) { + if (creation_meta == CreationMeta::MULTI_OUTPUT_NODE) { TORCH_CHECK(false, msg, " This view is the output of a function that returns multiple views. Such functions do not" " allow the output views to be modified inplace. You should replace the inplace operation by an" " out-of-place one."); } else { - if (diff_view_meta->creation_meta == CreationMeta::NO_GRAD_MODE) { + if (creation_meta == CreationMeta::NO_GRAD_MODE) { TORCH_INTERNAL_ASSERT(!grad_fn); msg = c10::str(msg, " Given that this use case is ambiguous and error-prone, it is deprecated and will be forbidden" " starting 1.6 (see https://github.com/pytorch/pytorch/pull/32839 for more details about this). You" " can clarify your code and remove this warning by moving both the view and the inplace either both" " inside the no_grad block (if you don't want the inplace to be tracked) or both outside (if you want" " the inplace to be tracked)."); - } else if (diff_view_meta->creation_meta == CreationMeta::IN_CUSTOM_FUNCTION) { + } else if (creation_meta == CreationMeta::IN_CUSTOM_FUNCTION) { msg = c10::str(msg, " This view was created inside a custom Function (or because an input was returned as-is) and the" " autograd logic to handle view+inplace would override the custom backward associated with the custom" " Function, leading to incorrect gradients. This behavior is deprecated and will be forbidden starting" " version 1.6. You can remove this warning by cloning the output of the custom Function."); - } else if (diff_view_meta->creation_meta == CreationMeta::MULTI_OUTPUT_SAFE) { + } else if (creation_meta == CreationMeta::MULTI_OUTPUT_SAFE) { msg = c10::str(msg, " This view is an output of a function that " "returns multiple views. 
Inplace operators on such " "views are being deprecated and will be forbidden " @@ -487,8 +550,10 @@ void handle_view_on_rebase(DifferentiableViewMeta* diff_view_meta, bool indirect // We warn only once per view // Note that if a Tensor is modified inplace from two threads at the same time, this is not thread safe and can warn // multiple time. This is ok as it should be a rare event. - diff_view_meta->creation_meta = CreationMeta::DEFAULT; + diff_view_meta->set_creation_meta(CreationMeta::DEFAULT); } } + + }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index cb8a763f246b..9c58f5c1a407 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,17 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { std::shared_ptr grad_fn_; std::weak_ptr grad_accumulator_; + // This field is used to store all the forward AD gradients + // associated with this AutogradMeta (and the Tensor it corresponds to) + // There is a semantic 1:1 correspondence between AutogradMeta and + // ForwardGrad but: + // - This field is lazily populated. + // - This field is a shared_ptr but it must never be + // shared by multiple Tensors. See Note [ Using ForwardGrad ] + // Any transition from not_initialized to initialized + // must be protected by mutex_ + std::shared_ptr fw_grad_; + std::vector> hooks_; std::shared_ptr cpp_hooks_list; @@ -211,9 +223,11 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { uint32_t output_nr_; // Mutex to ensure that concurrent read operations that modify internal - // state are still thread-safe. Used by grad_fn() and - // grad_accumulator(). - std::mutex mutex_; + // state are still thread-safe. Used by grad_fn(), grad_accumulator(), + // fw_grad() and set_fw_grad() + // This is mutable because we need to be able to acquire this from const + // version of this class for the functions above + mutable std::mutex mutex_; /// Sets the `requires_grad` property of `Variable`. This should be true for /// leaf variables that want to accumulate gradients, and false for all other @@ -238,6 +252,10 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { return grad_; } + const Variable& fw_grad(uint64_t level, const Variable& self) const override; + + void set_fw_grad(const Variable& new_grad, const Variable& self, uint64_t level, bool is_inplace_op) override; + AutogradMeta(at::TensorImpl* self_impl = nullptr, bool requires_grad = false, Edge gradient_edge = Edge() ) { grad_fn_ = std::move(gradient_edge.function); requires_grad_ = false; @@ -254,6 +272,55 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { !grad_fn_ || !requires_grad_, "requires_grad should be false if grad_fn is set"); } + + ~AutogradMeta() override { + // If AutogradMeta is being destroyed, it means that there is no other reference to its + // corresponding Tensor. It implies that no other thread can be using this object and so there is + // no need to lock mutex_ here to guard the check if fw_grad_ is populated. + if (fw_grad_) { + // See note [ Using ForwardGrad ] + fw_grad_->clear(); + } + } +}; + +struct TORCH_API ViewInfo { + /// The base `Variable` + /// If this ViewInfo represents a forward (respectively backward) AD gradient, + /// then this Tensor cannot be a forward (respectively backward) view. 
+ Variable base_; + + /// By default we use as_strided to recover views which is more efficient. + /// view_fn is only saved when as_strided is not supported. + /// If view_fn has value, we use it to recover views in backward. + c10::optional> view_fn_; + + /// Accessors for the view function + bool has_view_fn() const { + return view_fn_.has_value(); + } + + std::function view_fn() const { + TORCH_CHECK(has_view_fn(), "Can only access the view function if it exists."); + return view_fn_.value(); + } + + /// The chain function can be used to build a new ViewInfo for a differentiable view + /// function. It will return a new view info that accurately represents how "tensor" is + /// a view of this instance's "base_". + /// The "base" and "tensor" are respectively the input and output of the differentiable + /// view function that happened. They are required to properly set the optional + /// view_fn_ when it is not provided. + /// The "view_func", if provided, should be a function that allows to re-do the view + /// between "base" and "tensor". + ViewInfo chain(const Variable & base, const Variable & tensor, + c10::optional> view_func=c10::nullopt) const; + + ViewInfo(Variable base, c10::optional> view_fn) : + base_(std::move(base)), + view_fn_(std::move(view_fn)) { + TORCH_CHECK(base_.defined(), "base is undefined"); + } }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -274,6 +341,27 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { /// /// Differentiable Views /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// This class allows to track both forward and backward AD differentiable views. +/// These views can have different base as non-differentiable view for forward +/// and backward mode AD are not the same. +/// +/// Most function are either both forward and backward differentiable views (for +/// example: view, select, narrow, transpose, etc) or both not forward and not +/// backward differentiable views (for example: indices, values, eq, lt, etc). +/// But there are also functions that are forward but not backward differentiable +/// views (only detach for now) or functions that are backward but not forward +/// differentiable view (only make_dual and unpack dual for now). +/// +/// A concrete example of two views with different bases is as follow: +/// +/// # Have: +/// # dual is a dual Tensor that is neither a forward or backward view +/// detached_dual = dual.detach() +/// view = detached_dual.view_as(dual) +/// # The forward base of view is dual +/// # The backward base of view is detached_dual +/// +/// - Backward Mode View /// Differentiable views are the view variables where you want gradients to flow /// back to the base variables. Out-of-place operations on views are quite /// straightforward, but in-place ones are very tricky. Even if the base @@ -300,6 +388,34 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { /// var[1] filled with all ones and /// zeros everywhere else /// +/// - Forward Mode View +/// Forward differentiable views follow the same semantic as backward ones but +/// show up differently as they are computed along with the forward evaluation. 
+/// The hard examples above are thus very similar +/// +/// (1) in-place operation on view, e.g., +/// +/// # Have: +/// # base is a regular Tensor +/// # var is a dual Tensor whose tangent is all ones +/// base[1] = var # i.e., base[1].copy_(var) +/// # Now, base is a dual Tensor +/// _, fw_grad = fwAD.unpack_dual(base) <- fw_grad should be a tensor with +/// fw_grad[1] filled with all ones and +/// zeros everywhere else +/// +/// (2) in-place operation on base after view is created, e.g., +/// +/// # Have: +/// # base is a regular Tensor +/// # var is a dual Tensor whose tangent is all ones +/// view = base[1] +/// base.copy_(var) +/// _, fw_grad = fwAD.unpack_dual(view) <- fw_grad should be an all ones tensor +/// +/// See Note [Forward Grad View/inplace] for more details on how we handle these hard cases. +/// +/// /// DifferentiableViewMeta is created to support gradient tracking of /// such **in-place** operations. In particular, /// + if an in-place op is done on base, the grad_fn field of the view may @@ -392,37 +508,66 @@ enum class CreationMeta: uint8_t { DEFAULT, IN_CUSTOM_FUNCTION, MULTI_OUTPUT_NOD TORCH_API void handle_view_on_rebase(DifferentiableViewMeta* diff_view_meta, bool indirect=false); struct TORCH_API DifferentiableViewMeta : public AutogradMeta { - /// The base `Variable` (never a view). - Variable base_; +private: + /// Informations about the views + c10::optional backward_info_; + c10::optional forward_info_; + + /// The two following fields are extra information that we track to ensure that + /// any operation on this backward view is valid. /// The value of the version_counter at the time grad_fn was created. The - /// grad_fn field is stale if attr_version != - /// version_counter.current_version(). + /// grad_fn field is stale if attr_version != version_counter.current_version(). uint32_t attr_version; - - /// By default we use as_strided to recover views which is more efficient. - /// view_fn is only saved when as_strided is not supported. - /// If view_fn has value, we use it to recover views in backward. 
- c10::optional> view_fn_; - CreationMeta creation_meta; +public: + /// requires_grad is a backward AD field so we only use the view specific logic + /// for backward differentiable views bool requires_grad() const override { - return requires_grad_ || grad_fn_ || (is_view_ && base_.requires_grad()); + return requires_grad_ || grad_fn_ || (has_bw_view() && get_backward_view().base_.requires_grad()); } - bool has_view_fn() const { - return view_fn_.has_value(); + bool has_bw_view() const { + return backward_info_.has_value(); } - std::function view_fn() const { - TORCH_CHECK(has_view_fn(), "view_fn is not set."); - return view_fn_.value(); + const ViewInfo& get_backward_view() const { + TORCH_CHECK(has_bw_view(), "backward view info can only exist for backward views."); + return backward_info_.value(); + } + + uint32_t get_attr_version() const { + TORCH_CHECK(has_bw_view(), "attr_version can only exist for backward views."); + return attr_version; + } + + void set_attr_version(uint32_t new_attr_version) { + TORCH_CHECK(has_bw_view(), "attr_version can only exist for backward views."); + attr_version = new_attr_version; + } + + CreationMeta get_creation_meta() const { + TORCH_CHECK(has_bw_view(), "creation_meta can only exist for backward views."); + return creation_meta; } - DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base, c10::optional> view_fn, - CreationMeta creation_meta=CreationMeta::DEFAULT); - ~DifferentiableViewMeta(); + void set_creation_meta(CreationMeta new_creation_meta) { + TORCH_CHECK(has_bw_view(), "creation_meta can only exist for backward views."); + creation_meta = new_creation_meta; + } + + bool has_fw_view() const { + return forward_info_.has_value(); + } + + const ViewInfo& get_forward_view() const { + TORCH_CHECK(has_fw_view(), "forward view info can only exist for forward views."); + return forward_info_.value(); + } + + DifferentiableViewMeta(at::TensorImpl* self_impl, c10::optional backward_info, + c10::optional forward_info, CreationMeta creation_meta=CreationMeta::DEFAULT); }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -448,10 +593,11 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { // See NOTE [ Autograd View Variables ] for details. // Differentiable view. Track history with DifferentiableViewMeta. inline Variable make_variable_differentiable_view( - Variable base, const at::Tensor& data, + c10::optional backward_info, + c10::optional forward_info, CreationMeta creation_meta, - c10::optional> view_func = c10::nullopt) { + bool allow_tensor_metadata_change = true) { if (data.defined()) { // If we already did a TensorImpl allocation for data, just reuse it. 
// Otherwise(e.g tensor.swapdim(0, 0) when we return the same tensor as input), @@ -461,14 +607,16 @@ inline Variable make_variable_differentiable_view( if (data.getIntrusivePtr().unique() && data.getIntrusivePtr()->unique_version()) { at::TensorImpl* data_impl = data.unsafeGetTensorImpl(); data_impl->set_autograd_meta(std::make_unique( - data_impl, std::move(base), std::move(view_func), creation_meta)); + data_impl, std::move(backward_info), std::move(forward_info), + creation_meta)); return data; } else { c10::intrusive_ptr data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( /*version_counter=*/0, /*allow_tensor_metadata_change=*/true); data_impl_copy->set_autograd_meta(std::make_unique( - data_impl_copy.get(), std::move(base), std::move(view_func), creation_meta)); + data_impl_copy.get(), std::move(backward_info), std::move(forward_info), + creation_meta)); return Variable(data_impl_copy); } } diff --git a/torch/overrides.py b/torch/overrides.py index 0400e7609efc..c0e34634fd67 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -181,6 +181,8 @@ def get_ignored_functions() -> Set[Callable]: torch.is_deterministic, torch.set_deterministic, torch.unify_type_list, + torch.make_dual, + torch.unpack_dual, Tensor.__delitem__, Tensor.__dir__, Tensor.__getattribute__, From 21398fb6cbc7371814c2c64fb463c0c7c7639c18 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Tue, 22 Dec 2020 12:37:21 -0800 Subject: [PATCH 05/45] Fix get_overlap_status for tensors without storage (#49638) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49638 Reviewed By: ngimel Differential Revision: D25681908 Pulled By: asuhan fbshipit-source-id: 2ea8623614f2f0027f6437cf2819ba1657464f54 --- aten/src/ATen/MemoryOverlap.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index a9128e0e94ed..2269d9ae11dc 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -48,6 +48,9 @@ MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b) { if (!a->is_contiguous() || !b->is_contiguous()) { return MemOverlapStatus::TOO_HARD; } + if (!a->has_storage() || !b->has_storage()) { + return MemOverlapStatus::NO; + } if (a->storage().data() == b->storage().data()) { const auto a_begin = static_cast(a->data()); const auto a_end = a_begin + a->numel() * a->itemsize(); From 1451d84766ea26d6e789e11fadf2bc565624d4a0 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Tue, 22 Dec 2020 13:44:41 -0800 Subject: [PATCH 06/45] Minor doc fix: change truncating to rounding in TF32 docs (#49625) Summary: Minor doc fix in clarifying that the input data is rounded not truncated. CC zasdfgbnm ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/49625 Reviewed By: mruberry Differential Revision: D25668244 Pulled By: ngimel fbshipit-source-id: ac97e41e0ca296276544f9e9f85b2cf1790d9985 --- docs/source/notes/cuda.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 6deea675f265..34ee143a77d5 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -65,7 +65,7 @@ available on new NVIDIA GPUs since Ampere, internally to compute matmul (matrix and batched matrix multiplies) and convolutions. 
TF32 tensor cores are designed to achieve better performance on matmul and convolutions on -`torch.float32` tensors by truncating input data to have 10 bits of mantissa, and accumulating +`torch.float32` tensors by rounding input data to have 10 bits of mantissa, and accumulating results with FP32 precision, maintaining FP32 dynamic range. matmuls and convolutions are controlled separately, and their corresponding flags can be accessed at: From 04e04abd06b715878635c97d39b7c361a1538a7d Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 22 Dec 2020 13:53:06 -0800 Subject: [PATCH 07/45] remove unused THCBlas (#49725) Summary: removes unused THCBlas, call `at::cuda::blas::gemm` directly where needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49725 Reviewed By: mruberry Differential Revision: D25680831 Pulled By: ngimel fbshipit-source-id: d826f3f558b156f45f2a4864daf3f6d086bda78c --- BUILD.bazel | 1 - aten/src/THC/CMakeLists.txt | 2 - aten/src/THC/THC.h | 1 - aten/src/THC/THCBlas.cu | 33 ---------- aten/src/THC/THCBlas.h | 16 ----- aten/src/THC/THCTensorIndex.cu | 1 - aten/src/THC/THCTensorMathMagma.cu | 2 - aten/src/THC/THCTensorMathMagma.cuh | 2 - .../THCUNN/generic/SpatialConvolutionMM.cu | 63 ++----------------- 9 files changed, 5 insertions(+), 116 deletions(-) delete mode 100644 aten/src/THC/THCBlas.cu delete mode 100644 aten/src/THC/THCBlas.h diff --git a/BUILD.bazel b/BUILD.bazel index ec5111c5104d..b3faea487965 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -373,7 +373,6 @@ filegroup( filegroup( name = "thc_srcs_cu", srcs = [ - "aten/src/THC/THCBlas.cu.cc", "aten/src/THC/THCReduceApplyUtils.cu.cc", "aten/src/THC/THCSleep.cu.cc", "aten/src/THC/THCSortUtils.cu.cc", diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 4ba4a4ce4456..8ceab78f5abe 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -41,7 +41,6 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THCReduceApplyUtils.cu - ${CMAKE_CURRENT_SOURCE_DIR}/THCBlas.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCSleep.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCStorage.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu @@ -67,7 +66,6 @@ install(FILES THC.h ${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h THCGeneral.hpp - THCBlas.h THCSleep.h THCStorage.h THCStorageCopy.h diff --git a/aten/src/THC/THC.h b/aten/src/THC/THC.h index 79be433e1a84..7e522a599b9e 100644 --- a/aten/src/THC/THC.h +++ b/aten/src/THC/THC.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu deleted file mode 100644 index 99ee29d18766..000000000000 --- a/aten/src/THC/THCBlas.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include - -#ifdef __HIP_PLATFORM_HCC__ -#include -#endif - -/* Level 3 */ -void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) -{ - at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); -} - -void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) -{ - at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); -} - -void THCudaBlas_Bgemm(THCState *state, char transa, char 
transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc) -{ - at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); -} - -void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) -{ - at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); -} diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h deleted file mode 100644 index 52fb6ae0f964..000000000000 --- a/aten/src/THC/THCBlas.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef THC_BLAS_INC -#define THC_BLAS_INC - -#include -#include -#include - -/* Level 3 */ -TORCH_CUDA_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); -TORCH_CUDA_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); - -TORCH_CUDA_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); - -TORCH_CUDA_API void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc); - -#endif diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index dfd3a510e6e1..3bb429ed30e3 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu index a2fd5fe8baf5..ce6ca38afd2b 100644 --- a/aten/src/THC/THCTensorMathMagma.cu +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -9,8 +9,6 @@ #ifdef USE_MAGMA #include -#else -#include #endif #ifndef DIVUP diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh index 08124d3d4c91..5ceac465c317 100644 --- a/aten/src/THC/THCTensorMathMagma.cuh +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -3,8 +3,6 @@ #ifdef USE_MAGMA #include -#else -#include #endif #ifdef USE_MAGMA diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 808780c4c84b..599b09853913 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -190,16 +190,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) if (bias) { - #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemm( - #elif defined(THC_REAL_IS_HALF) - THCudaBlas_Hgemm( - #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemm( - #elif defined(THC_REAL_IS_BFLOAT16) - THCudaBlas_Bgemm( - #endif - state, + at::cuda::blas::gemm( 't', 'n', n_, m_, k_, ScalarConvert::to(1), @@ -234,16 +225,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) auto gemm_in_ptr = (kW != 1 || kH != 1) ? 
THCTensor_(data)(state, columns) : THCTensor_(data)(state, input_n); - #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemm( - #elif defined(THC_REAL_IS_HALF) - THCudaBlas_Hgemm( - #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemm( - #elif defined(THC_REAL_IS_BFLOAT16) - THCudaBlas_Bgemm( - #endif - state, + at::cuda::blas::gemm( 'n', 'n', n, m, k, ScalarConvert::to(1), @@ -332,16 +314,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemm( - #elif defined(THC_REAL_IS_HALF) - THCudaBlas_Hgemm( - #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemm( - #elif defined(THC_REAL_IS_BFLOAT16) - THCudaBlas_Bgemm( - #endif - state, + at::cuda::blas::gemm( 'n', 't', n, m, k, ScalarConvert::to(1), @@ -471,16 +444,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) auto gemm_in_ptr = (kW != 1 || kH != 1) ? THCTensor_(data)(state, columns) : THCTensor_(data)(state, input_n); - #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemm( - #elif defined(THC_REAL_IS_HALF) - THCudaBlas_Hgemm( - #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemm( - #elif defined(THC_REAL_IS_BFLOAT16) - THCudaBlas_Bgemm( - #endif - state, + at::cuda::blas::gemm( 't', 'n', n, m, k, scale, @@ -499,7 +463,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( int64_t k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + //#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_BFLOAT16) at::cuda::blas::gemv( 't', k_, m_, @@ -509,23 +473,6 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( ScalarConvert::to(1), THCTensor_(data)(state, gradBias), 1 ); - #endif - #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_BFLOAT16) - #ifdef THC_REAL_IS_HALF - THCudaBlas_Hgemm( - #elif defined(THC_REAL_IS_BFLOAT16) - THCudaBlas_Bgemm( - #endif - state, - 't', 'n', - m_, 1, k_, - scale, - THCTensor_(data)(state, gradOutput_n), k_, - THCTensor_(data)(state, ones), k_, - ScalarConvert::to(1), - THCTensor_(data)(state, gradBias), m_ - ); - #endif } } From 6f9532dd53668e28ce23657c52d383e00439400a Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Tue, 22 Dec 2020 14:13:37 -0800 Subject: [PATCH 08/45] only upload s3 stats on master, nightly, and release branch (#49645) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49645 Reviewed By: malfet Differential Revision: D25665851 Pulled By: walterddr fbshipit-source-id: 1cf50f6e3657f70776aaf3c5d3823c8a586bf22d --- test/print_test_stats.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/print_test_stats.py b/test/print_test_stats.py index a1e6552c0710..95b267028fae 100755 --- a/test/print_test_stats.py +++ b/test/print_test_stats.py @@ -135,9 +135,22 @@ def send_report_to_scribe(reports): def send_report_to_s3(reports, *, total_seconds): job = os.environ.get('CIRCLE_JOB') sha1 = os.environ.get('CIRCLE_SHA1') + branch = os.environ.get('CIRCLE_BRANCH', '') + if branch not in ['master', 'nightly'] and not branch.startswith("release/"): + print("S3 upload only enabled on master, nightly and release branches.") + print(f"skipping test report on branch: {branch}") + return now = datetime.datetime.utcnow().isoformat() 
key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC - obj = boto3.resource('s3').Object('ossci-metrics', key) + s3 = boto3.resource('s3') + try: + s3.get_bucket_acl(Bucket='ossci-metrics') + except Exception as e: + print(f"AWS ACL failed: {e}") + print("AWS credential found, uploading to S3...") + + obj = s3.Object('ossci-metrics', key) + print("") # use bz2 because the results are smaller than gzip, and the # compression time penalty we pay is only about half a second for # input files of a few megabytes in size like these JSON files, and From 9b6fb856e82832c44e6a08861ae7595e4fdf475c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Dec 2020 14:14:58 -0800 Subject: [PATCH 09/45] Update NNPACK (#49749) Summary: This update enables NNPACK cross compilation on MacOS Pull Request resolved: https://github.com/pytorch/pytorch/pull/49749 Reviewed By: janeyx99 Differential Revision: D25683056 Pulled By: malfet fbshipit-source-id: c7a6b7f49d61a9a0697d67f6319f06bd252b66a5 --- third_party/NNPACK | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/NNPACK b/third_party/NNPACK index 57616b9a0ef7..c07e3a040071 160000 --- a/third_party/NNPACK +++ b/third_party/NNPACK @@ -1 +1 @@ -Subproject commit 57616b9a0ef7b0f8e56bfe7e9738744b52fe1828 +Subproject commit c07e3a0400713d546e0dea2d5466dd22ea389c73 From be091600eddada5c47377b93db922263910840c7 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Tue, 22 Dec 2020 14:31:55 -0800 Subject: [PATCH 10/45] early terminate when CUDA assert were thrown (#49527) Summary: Fixes https://github.com/pytorch/pytorch/issues/49019 I marked the test_testing function as slow since it took ~1 minute to finish the subprocess test suite. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49527 Reviewed By: malfet Differential Revision: D25623219 Pulled By: walterddr fbshipit-source-id: 1b414623ecce14aace5e0996d5e4768a40e12e06 --- test/test_testing.py | 52 ++++++++++++++++++- torch/testing/_internal/common_device_type.py | 14 +++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index b87345186cb3..9285166cb15e 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -3,9 +3,9 @@ import math from torch.testing._internal.common_utils import \ - (TestCase, run_tests, make_tensor) + (TestCase, make_tensor, run_tests, slowTest) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes) + (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -438,6 +438,54 @@ def test_assert_messages(self, device): self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, debug_msg="no_user_msg")) self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) + @onlyCUDA + @slowTest + def test_cuda_assert_should_stop_test_suite(self, device): + # This test is slow because it spawn another process to run another test suite. + import subprocess + import sys + + problematic_test_script = """\ +#!/usr/bin/env python + +import torch + +from torch.testing._internal.common_utils import (TestCase, run_tests) +from torch.testing._internal.common_device_type import instantiate_device_type_tests + +# This test is added to ensure that test suite terminates early when +# CUDA assert was thrown since all subsequent test will fail. 
+# See: https://github.com/pytorch/pytorch/issues/49019 +# This test file should be invoked from test_testing.py +class TestThatContainsCUDAAssertFailure(TestCase): + + def test_throw_unrecoverable_cuda_exception(self, device): + x = torch.rand(10, device=device) + # cause unrecoverable CUDA exception, recoverable on CPU + y = x[torch.tensor([25])].cpu() + + def test_trivial_passing_test_case_on_cpu_cuda(self, device): + x1 = torch.tensor([0., 1.], device=device) + x2 = torch.tensor([0., 1.], device='cpu') + self.assertEqual(x1, x2) + +instantiate_device_type_tests( + TestThatContainsCUDAAssertFailure, + globals(), + except_for=None +) + +if __name__ == '__main__': + run_tests() +""" + + # Test running of cuda assert test suite should early terminate. + p = subprocess.run([sys.executable, '-c', problematic_test_script], capture_output=True, timeout=120) + # should capture CUDA error + self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) + # should run only 3 tests - 2 CPUs and 1 CUDA (remaining CUDA test should skip) + self.assertIn('Ran 3 tests', p.stderr.decode('ascii')) + instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 36f02eff0c0f..73185116a4f5 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -187,6 +187,9 @@ def _construct_test_name(test_name, op, device_type, dtype): class DeviceTypeTestBase(TestCase): device_type: str = 'generic_device_type' + # Flag to disable test suite early due to unrecoverable error such as CUDA error. + _stop_test_suite = False + # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase._precision @@ -271,6 +274,11 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): self.precision = self._get_precision_override(test_fn, dtype) args = (arg for arg in (device_arg, dtype, op) if arg is not None) result = test_fn(self, *args) + except RuntimeError as rte: + if 'CUDA error: device-side assert triggered' in rte.__repr__(): + self._stop_test_suite = True + # raise the runtime error as is. + raise rte finally: self.precision = guard_precision @@ -313,6 +321,12 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): for dtype in dtypes: instantiate_test_helper(cls, name, test=test, dtype=dtype, op=None) + def run(self, result=None): + super().run(result=result) + # Early terminate test if _stop_test_suite is set. 
+ if self._stop_test_suite: + result.stop() + class CPUTestBase(DeviceTypeTestBase): device_type = 'cpu' From 27804009049f0fa440ae3554fef81495bd71e211 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 22 Dec 2020 15:01:53 -0800 Subject: [PATCH 11/45] [numpy] Add `torch.xlogy` (#48777) Summary: Reference https://github.com/pytorch/pytorch/issues/38349 Fixes https://github.com/pytorch/pytorch/issues/22656 TODO: * [x] Add docs * [x] Add tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/48777 Reviewed By: ngimel Differential Revision: D25681346 Pulled By: mruberry fbshipit-source-id: 369e0a29ac8a2c44de95eec115bf75943fe1aa45 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/native/BinaryOps.cpp | 38 +++++++ aten/src/ATen/native/BinaryOps.h | 1 + aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 15 +++ .../ATen/native/cuda/BinaryMiscOpsKernels.cu | 16 +++ aten/src/ATen/native/native_functions.yaml | 50 +++++++++ docs/source/tensors.rst | 2 + docs/source/torch.rst | 1 + test/test_autograd.py | 50 ++++++++- test/test_binary_ufuncs.py | 105 +++++++++++++++++- tools/autograd/derivatives.yaml | 10 ++ torch/_tensor_docs.py | 14 +++ torch/_torch_docs.py | 42 +++++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 29 ++++- 15 files changed, 366 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 7b0759c3671b..644d75c04c06 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -436,6 +436,7 @@ _(aten, logdet) \ _(aten, logit) \ _(aten, logspace) \ _(aten, logsumexp) \ +_(aten, xlogy) \ _(aten, lstm) \ _(aten, lstm_cell) \ _(aten, lstsq) \ diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index e8751be55387..9103eafb1f12 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -62,6 +62,7 @@ DEFINE_DISPATCH(igammac_stub); DEFINE_DISPATCH(nextafter_stub); DEFINE_DISPATCH(heaviside_stub); DEFINE_DISPATCH(copysign_stub); +DEFINE_DISPATCH(xlogy_stub); static Tensor wrapped_scalar_tensor(Scalar scalar) { auto tensor = scalar_to_tensor(scalar); @@ -1101,5 +1102,42 @@ Tensor& ldexp_(Tensor& self, const Tensor& other) { return at::ldexp_out(self, self, other); } +Tensor& xlogy_out(Tensor& result, const Tensor& self, const Tensor& other) { + auto iter = TensorIterator::binary_float_op(result, self, other); + xlogy_stub(iter.device_type(), iter); + return result; +} + +Tensor& xlogy_out(Tensor& result, Scalar self, const Tensor& other) { + return at::xlogy_out(result, c10::scalar_to_tensor(self, other.device()), other); +} + +Tensor& xlogy_out(Tensor& result, const Tensor& self, Scalar other) { + return at::xlogy_out(result, self, c10::scalar_to_tensor(other, self.device())); +} + +Tensor xlogy(const Tensor& x, const Tensor& y) { + Tensor result; + auto iter = TensorIterator::binary_float_op(result, x, y); + xlogy_stub(iter.device_type(), iter); + return iter.output(); +} + +Tensor xlogy(Scalar x, const Tensor& y) { + return at::xlogy(c10::scalar_to_tensor(x, y.device()), y); +} + +Tensor xlogy(const Tensor& x, Scalar y) { + return at::xlogy(x, c10::scalar_to_tensor(y, x.device())); +} + +Tensor& xlogy_(Tensor& x, const Tensor& y) { + return at::xlogy_out(x, x, y); +} + +Tensor& xlogy_(Tensor& x, Scalar y) { + return at::xlogy_out(x, x, c10::scalar_to_tensor(y, x.device())); +} + } // namespace native } // namespace at diff --git 
a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index 1fdb80590b5a..191611875f08 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -74,5 +74,6 @@ DECLARE_DISPATCH(binary_fn, igammac_stub); DECLARE_DISPATCH(binary_fn, nextafter_stub); DECLARE_DISPATCH(binary_fn, heaviside_stub); DECLARE_DISPATCH(binary_fn, copysign_stub); +DECLARE_DISPATCH(binary_fn, xlogy_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index ddfa8a2d3d95..3dfe130ced70 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -818,6 +818,20 @@ void copysign_kernel(TensorIterator& iter) { }); } +void xlogy_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "xlogy_cpu", [&]() { + cpu_kernel(iter, [](scalar_t x, scalar_t y) -> scalar_t { + if (at::_isnan(y)){ + return NAN; + } + if (x == 0){ + return 0; + } + return x * std::log(y); + }); + }); +} + } // namespace REGISTER_DISPATCH(add_stub, &add_kernel); @@ -859,6 +873,7 @@ REGISTER_DISPATCH(igammac_stub, &igammac_kernel); REGISTER_DISPATCH(nextafter_stub, &nextafter_kernel); REGISTER_DISPATCH(heaviside_stub, &heaviside_kernel); REGISTER_DISPATCH(copysign_stub, ©sign_kernel); +REGISTER_DISPATCH(xlogy_stub, &xlogy_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index c0efde1671d1..2379877e91ba 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -3,6 +3,7 @@ #include #include #include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. @@ -29,8 +30,23 @@ void mse_kernel_cuda(TensorIterator& iter) { }); } +void xlogy_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "xlogy_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { + if (at::_isnan(y)){ + return NAN; + } + if (x == 0){ + return 0; + } + return x * std::log(y); + }); + }); +} + REGISTER_DISPATCH(smooth_l1_stub, &smooth_l1_kernel_cuda); REGISTER_DISPATCH(mse_stub, &mse_kernel_cuda); +REGISTER_DISPATCH(xlogy_stub, &xlogy_kernel_cuda); // DO NOT ADD ANY NEW KERNELS HERE // CUDA compilation times grow quickly. It's perfectly acceptable to have a file per kernel. diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 78ad11229428..9c0053f40b7e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2560,6 +2560,56 @@ dispatch: DefaultBackend: logaddexp2 +- func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: xlogy + +- func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: xlogy + +- func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: xlogy + +# xlogy: inplace variant +- func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: xlogy_ + +- func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: xlogy_ + +# xlogy: out variant +- func: xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU, CUDA: xlogy_out + +- func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU, CUDA: xlogy_out + +- func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU, CUDA: xlogy_out + - func: logdet(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index f73753743d59..315cc9dc5309 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -645,6 +645,8 @@ view of a storage and defines numeric operations on it. .. automethod:: view .. automethod:: view_as .. automethod:: where + .. automethod:: xlogy + .. automethod:: xlogy_ .. automethod:: zero_ .. class:: BoolTensor() diff --git a/docs/source/torch.rst b/docs/source/torch.rst index c82035eb8684..3057339aa811 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -350,6 +350,7 @@ Pointwise Ops tanh true_divide trunc + xlogy Reduction Ops ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/test/test_autograd.py b/test/test_autograd.py index 3d29529cab9a..a8a130596855 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -10,7 +10,7 @@ import warnings from copy import deepcopy from collections import OrderedDict -from itertools import product +from itertools import product, permutations from operator import mul from functools import reduce import torch @@ -7396,6 +7396,54 @@ def test_atleast(self, device): self._test_atleast(device, torch.atleast_2d) self._test_atleast(device, torch.atleast_3d) + def test_xlogy(self, device): + + def _tensor_tensor_helper(x, y): + gradcheck(lambda x, y: torch.xlogy(x, y), (x, y)) + gradgradcheck(lambda x, y: torch.xlogy(x, y), (x, y)) + + with torch.no_grad(): + x = x.clone() + x[torch.rand_like(x) > 0.5] = 0 + + gradcheck(lambda y: torch.xlogy(x, y), (y)) + gradgradcheck(lambda y: torch.xlogy(x, y), (y)) + + shapes = ((4,), (1, 4), (1, 1, 4), (1, 1, 1, 4)) + + # For broadcastible shapes and scalar. 
+ for x_shape, y_shape in permutations(shapes, 2): + x = torch.rand(*x_shape, dtype=torch.double, device=device, requires_grad=True) + y = torch.rand(*y_shape, dtype=torch.double, device=device, requires_grad=True) + + _tensor_tensor_helper(x, y) + _tensor_tensor_helper(y, x) + + gradcheck(lambda y: torch.xlogy(0, y), (y)) + gradgradcheck(lambda y: torch.xlogy(0, y), (y)) + + gradcheck(lambda y: torch.xlogy(2, y), (y)) + gradgradcheck(lambda y: torch.xlogy(2, y), (y)) + gradcheck(lambda y: torch.xlogy(y, 2), (y)) + gradgradcheck(lambda y: torch.xlogy(y, 2), (y)) + + # Different shape + x = torch.rand(2, 3, 4, 5, dtype=torch.double, device=device, requires_grad=True) + y = torch.rand(4, 5, dtype=torch.double, device=device, requires_grad=True) + _tensor_tensor_helper(x, y) + _tensor_tensor_helper(y, x) + _tensor_tensor_helper(x, x) + _tensor_tensor_helper(y, y) + + # Same shape + x = torch.rand(4, 5, dtype=torch.double, device=device, requires_grad=True) + y = torch.rand(4, 5, dtype=torch.double, device=device, requires_grad=True) + _tensor_tensor_helper(x, y) + _tensor_tensor_helper(y, x) + _tensor_tensor_helper(x, x) + _tensor_tensor_helper(y, y) + + class TestMultithreadAutograd(TestCase): def _run_py_multithread_fn(self, fn, args=(), num_threads=10, kwargs=None): threads = [] diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 9888c29130bb..5739fb569628 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -8,15 +8,19 @@ import unittest import warnings import operator +from functools import partial from torch._six import inf, nan from torch.testing._internal.common_utils import ( TestCase, iter_indices, TEST_WITH_ASAN, run_tests, - torch_to_numpy_dtype_dict, make_tensor) + torch_to_numpy_dtype_dict, make_tensor, TEST_SCIPY) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, - skipCUDAIfRocm) + skipCUDAIfRocm, skipIf) + +if TEST_SCIPY: + import scipy.special # TODO: remove this def _generate_input(shape, dtype, device, with_extremal): @@ -2488,6 +2492,103 @@ def _promo_helper(x, y): with self.assertRaisesRegex(RuntimeError, "is not the desired type"): torch.Tensor.float_power_(base.clone(), exp) + @skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False), + torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False))) + def test_xlogy(self, device, dtypes): + def out_variant_helper(torch_fn, x, y): + expected = torch_fn(x, y) + out = torch.empty_like(expected) + torch_fn(x, y, out=out) + self.assertEqual(expected, out) + + def inplace_variant_helper(x, y): + if x.dtype in torch.testing.get_all_int_dtypes() + [torch.bool]: + with self.assertRaisesRegex(RuntimeError, + "can't be cast to the desired output type"): + x.clone().xlogy_(y) + else: + expected = torch.empty_like(x) + torch.xlogy(x, y, out=expected) + inplace_out = x.clone().xlogy_(y) + self.assertEqual(expected, inplace_out) + + x_dtype, y_dtype = dtypes + + # Tensor-Tensor Test (tensor of same and different shape) + x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) + z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + + torch_fn = partial(torch.xlogy, x) + reference_fn = partial(scipy.special.xlogy, x.cpu().numpy()) + + 
self.compare_with_numpy(torch_fn, reference_fn, x, exact_dtype=False) + self.compare_with_numpy(torch_fn, reference_fn, y, exact_dtype=False) + self.compare_with_numpy(torch_fn, reference_fn, z, exact_dtype=False) + out_variant_helper(torch.xlogy, x, x) + out_variant_helper(torch.xlogy, x, y) + out_variant_helper(torch.xlogy, x, z) + inplace_variant_helper(x, x) + inplace_variant_helper(x, y) + inplace_variant_helper(x, z) + + # Scalar-Tensor Test + torch_fn = partial(torch.xlogy, 3.14) + reference_fn = partial(scipy.special.xlogy, 3.14) + + self.compare_with_numpy(torch_fn, reference_fn, x, exact_dtype=False) + self.compare_with_numpy(torch_fn, reference_fn, y, exact_dtype=False) + self.compare_with_numpy(torch_fn, reference_fn, z, exact_dtype=False) + out_variant_helper(torch.xlogy, 3.14, x) + out_variant_helper(torch.xlogy, 3.14, y) + out_variant_helper(torch.xlogy, 3.14, z) + + # Special Values Tensor-Tensor + t = torch.tensor([0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + zeros = torch.zeros(6, dtype=y_dtype, device=device) + + torch_fn = partial(torch.xlogy, zeros) + reference_fn = partial(scipy.special.xlogy, zeros.cpu().numpy()) + self.compare_with_numpy(torch_fn, reference_fn, t, exact_dtype=False) + out_variant_helper(torch.xlogy, zeros, t) + inplace_variant_helper(zeros, t) + + # Special Values Scalar-Tensor + torch_fn = partial(torch.xlogy, 0) + reference_fn = partial(scipy.special.xlogy, 0) + self.compare_with_numpy(torch_fn, reference_fn, t, exact_dtype=False) + out_variant_helper(torch.xlogy, 0, t) + + @skipIf(not TEST_SCIPY, "Scipy required for the test.") + def test_xlogy_bfloat16(self, device): + def _compare_helper(x, y): + x_np = x if isinstance(x, float) else x.cpu().to(torch.float).numpy() + y_np = y if isinstance(y, float) else y.cpu().to(torch.float).numpy() + expected = torch.from_numpy(scipy.special.xlogy(x_np, y_np)) + actual = torch.xlogy(x, y) + self.assertEqual(expected, actual, exact_dtype=False) + + x_dtype, y_dtype = torch.bfloat16, torch.bfloat16 + + # Tensor-Tensor Test (tensor of same and different shape) + x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) + z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + + _compare_helper(x, x) + _compare_helper(x, y) + _compare_helper(x, z) + + _compare_helper(x, 3.14) + _compare_helper(y, 3.14) + _compare_helper(z, 3.14) + + # Special Values Tensor-Tensor + t = torch.tensor([0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + zeros = torch.tensor(5, dtype=y_dtype, device=device) + _compare_helper(t, zeros) + _compare_helper(t, 0.) 
tensor_binary_ops = [ '__lt__', '__le__', diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 7a619b926612..9f68622e7691 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -647,6 +647,16 @@ self: grad / (1 + pow(2, other - self)) other: grad / (1 + pow(2, self - other)) +- name: xlogy.Tensor(Tensor self, Tensor other) -> Tensor + self: grad * at::xlogy((self != 0), other) + other: grad * self / other + +- name: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor + other: grad * self / other + +- name: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor + self: grad * at::xlogy((self != 0), other) + - name: logdet(Tensor self) -> Tensor self: logdet_backward(grad, self, result) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index f081b595de2f..e9443202785d 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -4472,6 +4472,20 @@ def callable(a, b) -> number Out-of-place version of :meth:`torch.Tensor.masked_scatter_` """) +add_docstr_all('xlogy', + r""" +xlogy(other) -> Tensor + +See :func:`torch.xlogy` +""") + +add_docstr_all('xlogy_', + r""" +xlogy_(other) -> Tensor + +In-place version of :meth:`~Tensor.xlogy` +""") + add_docstr_all('masked_fill', r""" masked_fill(mask, value) -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 91da41bf05d4..029494284f39 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4371,6 +4371,48 @@ def merge_dicts(*dicts): {out} """.format(**common_args)) +add_docstr(torch.xlogy, + r""" +xlogy(input, other, *, out=None) -> Tensor + +Computes ``input * log(other)`` with the following cases. + +.. math:: + \text{out}_{i} = \begin{cases} + \text{NaN} & \text{if } \text{other}_{i} = \text{NaN} \\ + 0 & \text{if } \text{input}_{i} = 0.0 \\ + \text{input}_{i} * \log{(\text{other}_{i})} & \text{otherwise} + \end{cases} + +Similar to SciPy's `scipy.special.xlogy`. + +""" + r""" + +Args: + input (Number or Tensor) + other (Number or Tensor) + +.. note:: At least one of :attr:`input` or :attr:`other` must be a tensor. 
+ +Keyword args: + {out} + +Example:: + + >>> x = torch.zeros(5,) + >>> y = torch.tensor([-1, 0, 1, float('inf'), float('nan')]) + >>> torch.xlogy(x, y) + tensor([0., 0., 0., 0., nan]) + >>> x = torch.tensor([1, 2, 3]) + >>> y = torch.tensor([3, 2, 1]) + >>> torch.xlogy(x, y) + tensor([1.0986, 1.3863, 0.0000]) + >>> torch.xlogy(x, 4) + tensor([1.3863, 2.7726, 4.1589]) + >>> torch.xlogy(2, y) + tensor([2.1972, 1.3863, 0.0000]) +""".format(**common_args)) + add_docstr(torch.logical_and, r""" logical_and(input, other, *, out=None) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index c0e34634fd67..d23e34831bdd 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -495,6 +495,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.logaddexp: lambda input, other, out=None: -1, torch.logaddexp2: lambda input, other, out=None: -1, torch.logdet: lambda input: -1, + torch.xlogy: lambda x, y: -1, torch.logical_and: lambda input, other, out=None: -1, torch.logical_not: lambda input, out=None: -1, torch.logical_or: lambda input, other, out=None: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index ba29c42f39ff..55b97b38a4da 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -291,12 +291,21 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad): return (SampleInput((make_tensor((S, S), device, dtype, low=None, high=None, requires_grad=requires_grad), - make_tensor((S, S), device, dtype, - low=None, high=None, - requires_grad=requires_grad), - make_tensor((S, S), device, dtype, - low=None, high=None, - requires_grad=False))),) + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=False))),) + + +def sample_inputs_xlogy(self, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=0, high=None, + requires_grad=requires_grad))),) def np_sinc_with_fp16_as_fp32(x): # Wraps numpy's sinc function so that fp16 values are promoted to fp32 @@ -1084,6 +1093,14 @@ def reference_sigmoid(x): dtypes=[torch.bfloat16]),), assert_autodiffed=True, promotes_integers_to_float=True), + OpInfo('xlogy', + dtypes=all_types_and(torch.bool), + dtypesIfCPU=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + test_inplace_grad=True, + supports_tensor_out=True, + promotes_integers_to_float=True, + sample_inputs_func=sample_inputs_xlogy), ] op_db = op_db + op_db_scipy_reference From 67d0c1824178c45430ea6c81137f84c276d3b20b Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Dec 2020 15:18:16 -0800 Subject: [PATCH 12/45] [FX] Try to make it more clear that _update_args_kwargs should not be called (#49745) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49745 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D25682177 Pulled By: jamesr66a fbshipit-source-id: 4910577541c4d41e1be50a7aa061873f061825b6 --- test/test_fx.py | 6 ++++++ torch/fx/node.py | 15 +++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index ac893b75a611..5e285039a6dd 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1166,6 +1166,12 @@ def 
forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), MyNamedTup(input, input)) + def test_update_args_kwargs_yells_at_you(self): + symtraced = symbolic_trace(SimpleTest()) + node = next(iter(symtraced.graph.nodes)) + with self.assertRaisesRegex(AttributeError, '__update_args_kwargs'): + node.__update_args_kwargs((), {}) + def test_torchbind_class_attribute_in_fx(self): if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: self.skipTest("torch.classes._TorchScriptTesting._StackString is registered, skipping") diff --git a/torch/fx/node.py b/torch/fx/node.py index 5e33a2a6d0da..629c8ca98957 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -63,7 +63,7 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target', # The public API for this is `all_input_nodes`, this private attribute # should not be accessed directly. self._input_nodes : Dict[Node, None] = {} - self._update_args_kwargs(map_arg(args, lambda x: x), map_arg(kwargs, lambda x: x)) # type: ignore + self.__update_args_kwargs(map_arg(args, lambda x: x), map_arg(kwargs, lambda x: x)) # type: ignore # All of the nodes that use the value produced by this Node # Note one user may correspond to several uses, e.g. the node fo ``x + x`` @@ -159,7 +159,9 @@ def args(self, a : Tuple[Argument, ...]): depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ - self._update_args_kwargs(map_arg(a, lambda x: x), self._kwargs) # type: ignore + # DO NOT CALL `__update_args_kwargs` directly. The correct way to + # set `args` is via direct assignment, i.e. `node.args = new_args` + self.__update_args_kwargs(map_arg(a, lambda x: x), self._kwargs) # type: ignore @property def kwargs(self) -> Dict[str, Argument]: @@ -180,7 +182,9 @@ def kwargs(self, k : Dict[str, Argument]): depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ - self._update_args_kwargs(self._args, map_arg(k, lambda x: x)) # type: ignore + # DO NOT CALL `__update_args_kwargs` directly. The correct way to + # set `args` is via direct assignment, i.e. `node.kwargs = new_kwargs` + self.__update_args_kwargs(self._args, map_arg(k, lambda x: x)) # type: ignore @property def all_input_nodes(self) -> List['Node']: @@ -196,7 +200,7 @@ def all_input_nodes(self) -> List['Node']: """ return list(self._input_nodes.keys()) - def _update_args_kwargs(self, new_args : Tuple[Argument, ...], new_kwargs : Dict[str, Argument]): + def __update_args_kwargs(self, new_args : Tuple['Argument', ...], new_kwargs : Dict[str, 'Argument']): """ This API is internal. Do *not* call it directly. """ @@ -240,12 +244,11 @@ def maybe_replace_node(n : Node) -> Node: new_kwargs = map_arg(use_node.kwargs, maybe_replace_node) assert isinstance(new_args, tuple) assert isinstance(new_kwargs, dict) - use_node._update_args_kwargs(new_args, new_kwargs) + use_node.__update_args_kwargs(new_args, new_kwargs) assert len(self.users) == 0 return to_process - def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ if isinstance(a, tuple): From b414123264cacca98a5ff8c91b2efe2d0f0b4b9f Mon Sep 17 00:00:00 2001 From: Tyler Davis Date: Tue, 22 Dec 2020 15:52:38 -0800 Subject: [PATCH 13/45] Update `is_floating_point()` docs to mention bfloat16 (#49611) Summary: Fixes https://github.com/pytorch/pytorch/issues/49610 . Explicitly mentions that `is_floating_point()` will return `True` if passed a `bfloat16` tensor. 
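
For reference, the documented behavior can be checked with a quick snippet (illustrative only, not part of the patch):

>>> import torch
>>> torch.is_floating_point(torch.zeros(2, dtype=torch.bfloat16))
True
>>> torch.is_floating_point(torch.zeros(2, dtype=torch.float16))
True
>>> torch.is_floating_point(torch.zeros(2, dtype=torch.int32))
False
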
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49611 Reviewed By: mrshenli Differential Revision: D25660723 Pulled By: VitalyFedyunin fbshipit-source-id: 04fab2f6c1c5c2859c6efff1976a92a676b9efa3 --- torch/_torch_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 029494284f39..ae0ffd916e51 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3865,7 +3865,7 @@ def merge_dicts(*dicts): is_floating_point(input) -> (bool) Returns True if the data type of :attr:`input` is a floating point data type i.e., -one of ``torch.float64``, ``torch.float32`` and ``torch.float16``. +one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``. Args: {input} From c3a7591cef525aa46df8176fba01d15d37d65828 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 22 Dec 2020 16:47:34 -0800 Subject: [PATCH 14/45] fx quant: do not observe bias on F.conv (#49623) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49623 (not ready for review) Ensures that conv bias is not observed in a `F.conv{n}d` call. Test Plan: Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25652856 fbshipit-source-id: 884f87be1948d3e049a557d79bec3c90aec34340 --- test/quantization/test_quantize_fx.py | 26 ++++++++-- torch/quantization/fx/quantize.py | 47 +++++++++++++++---- .../testing/_internal/common_quantization.py | 12 +++-- 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 98283e713747..14d66a9a119c 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1346,12 +1346,12 @@ def forward(self, x): self.checkGraphModeFxOp(model, data, quant_type, quantized_node) @skipIfNoFBGEMM - def test_quantized_conv(self): + def test_conv_module(self): conv_module = {1 : torch.nn.Conv1d, 2 : torch.nn.Conv2d, 3 : torch.nn.Conv3d} - class Conv(torch.nn.Module): + class ConvWrapper(torch.nn.Module): def __init__(self, dim): - super(Conv, self).__init__() + super(ConvWrapper, self).__init__() self.conv = conv_module[dim](3, 3, 3).float() def forward(self, x): @@ -1366,9 +1366,27 @@ def forward(self, x): } for dim, quant_type in options: model = self.checkGraphModeFxOp( - Conv(dim), self.img_data_dict[dim], quant_type, + ConvWrapper(dim), self.img_data_dict[dim], quant_type, quantized_nodes[dim]) + @skipIfNoFBGEMM + def test_conv2d_functional(self): + for bias in [True, False]: + conv = torch.nn.Conv2d(1, 1, 1, bias=bias) + # There should be 3 observers: after input, weight and activation. + # No observer after bias. 
+ prepare_expected_node_occurrence = { + ns.call_module(torch.quantization.HistogramObserver): 2, + ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, + } + expected_node_occurrence = \ + {ns.call_function(torch.ops.quantized.conv2d): 1} + self.checkGraphModeFxOp( + conv, (torch.randn(4, 1, 4, 4),), QuantType.STATIC, + prepare_expected_node_occurrence=prepare_expected_node_occurrence, + expected_node_occurrence=expected_node_occurrence, + ) + @skipIfNoFBGEMM def test_quantized_conv_relu(self): """tests for conv1d_relu/conv2d_relu/conv3d_relu""" diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 3d4a92323067..363191488839 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -234,10 +234,38 @@ def insert_observer_for_input_arg_of_observed_node( # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { + torch.nn.functional.conv1d : [1], torch.nn.functional.conv2d : [1], + torch.nn.functional.conv3d : [1], torch.nn.functional.linear : [1], } +def node_arg_is_weight(node: Node, arg: Any) -> bool: + if isinstance(node, Node) and node.op == 'call_function' and \ + node.target in WEIGHT_INDEX_DICT: + for i, node_arg in enumerate(node.args): + if arg is node_arg and i in \ + WEIGHT_INDEX_DICT[node.target]: # type: ignore + return True + return False + +# A dictionary for querying the weight index for a given op +# TODO(future PR): handle linear +BIAS_INDEX_DICT = { + torch.nn.functional.conv1d : [2], + torch.nn.functional.conv2d : [2], + torch.nn.functional.conv3d : [2], +} + +def node_arg_is_bias(node: Node, arg: Any) -> bool: + if isinstance(node, Node) and node.op == 'call_function' and \ + node.target in BIAS_INDEX_DICT: + for i, node_arg in enumerate(node.args): + if arg is node_arg and i in \ + BIAS_INDEX_DICT[node.target]: # type: ignore + return True + return False + # weight prepacking ops WEIGHT_PREPACK_OPS = { torch._ops.ops.quantized.linear_prepack, @@ -956,15 +984,16 @@ def _find_quants(self, graph: Graph, matches: Dict[str, MatchResult], def visit(node, matched_pattern, qconfig): def visit_arg(arg): - is_weight = False - if isinstance(node, Node) and node.op == 'call_function' and \ - node.target in WEIGHT_INDEX_DICT: - for i, node_arg in enumerate(node.args): - if arg is node_arg and i in \ - WEIGHT_INDEX_DICT[node.target]: # type: ignore - is_weight = True - if qconfig is not None and \ - (activation_is_statically_quantized(qconfig) or is_weight): + is_weight = node_arg_is_weight(node, arg) + is_bias = node_arg_is_bias(node, arg) + is_activation = not (is_weight or is_bias) + should_add_handler = qconfig is not None and ( + (is_activation and + activation_is_statically_quantized(qconfig)) or + (is_weight and weight_is_statically_quantized(qconfig)) + ) + + if should_add_handler: act_post_process_ctr = qconfig.weight if is_weight else \ qconfig.activation # overwrite the constructor from qconfig diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index e05425eb67a2..eef9381d79d9 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -672,6 +672,13 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, if not quant_type == QuantType.DYNAMIC: prepared(*inputs) + if print_debug_info: + print() + print('quant type:\n', quant_type) + print('original model:\n', model) + print() + print('prepared model:\n', prepared) + self.checkGraphModuleNodes( prepared, 
prepare_expected_node, prepare_expected_node_occurrence, prepare_expected_node_list) @@ -685,10 +692,7 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, qgraph_to_check = qgraph_debug if debug else qgraph if print_debug_info: print() - print('quant type:', quant_type) - print('original model:', model) - print() - print('quantized model:', qgraph_to_check) + print('quantized model:\n', qgraph_to_check) self.printGraphModule(qgraph_to_check) print() self.checkGraphModuleNodes( From 19f972b6964eb59bb1a6c09d4bafe015b86fc45c Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 22 Dec 2020 16:47:34 -0800 Subject: [PATCH 15/45] fx quant: do not observe bias on F.linear (#49628) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49628 Ensures that linear bias is not observed in a `F.linear` call. This should be a small speedup in PTQ, and will change numerics (in a good way) for QAT if someone is using `F.linear`. Note: the implementation is slightly more verbose compared to conv because bias is a keyword argument in Linear. Test Plan: ``` python test/test_quantization.py TestQuantizeFxOps.test_linear_functional_bias_not_observed ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25653532 fbshipit-source-id: c93501bf6b55cbe4a11cfdad6f79313483133a39 --- test/quantization/test_quantize_fx.py | 14 ++++++++++++++ torch/quantization/fx/quantize.py | 26 ++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 14d66a9a119c..7b7b5ffb83a0 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1387,6 +1387,20 @@ def test_conv2d_functional(self): expected_node_occurrence=expected_node_occurrence, ) + def test_linear_functional_bias_not_observed(self): + data = (torch.rand((1, 4), dtype=torch.float),) + for bias in [True, False]: + linear = torch.nn.Linear(4, 4, bias=bias) + # There should be 3 observers: after input, weight and activation. 
+ expected_node_occurrence = { + ns.call_module(torch.quantization.HistogramObserver): 2, + ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, + } + self.checkGraphModeFxOp( + linear, data, QuantType.STATIC, + prepare_expected_node_occurrence=expected_node_occurrence, + ) + @skipIfNoFBGEMM def test_quantized_conv_relu(self): """tests for conv1d_relu/conv2d_relu/conv3d_relu""" diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 363191488839..2cdd7b59b314 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -249,21 +249,23 @@ def node_arg_is_weight(node: Node, arg: Any) -> bool: return True return False -# A dictionary for querying the weight index for a given op -# TODO(future PR): handle linear -BIAS_INDEX_DICT = { - torch.nn.functional.conv1d : [2], - torch.nn.functional.conv2d : [2], - torch.nn.functional.conv3d : [2], +CONV_OPS_WITH_BIAS = { + torch.nn.functional.conv1d, + torch.nn.functional.conv2d, + torch.nn.functional.conv3d, } +CONV_BIAS_ARG_INDEX = 2 def node_arg_is_bias(node: Node, arg: Any) -> bool: - if isinstance(node, Node) and node.op == 'call_function' and \ - node.target in BIAS_INDEX_DICT: - for i, node_arg in enumerate(node.args): - if arg is node_arg and i in \ - BIAS_INDEX_DICT[node.target]: # type: ignore - return True + if isinstance(node, Node) and node.op == 'call_function': + if node.target in CONV_OPS_WITH_BIAS: + for i, node_arg in enumerate(node.args): + if arg is node_arg and i == CONV_BIAS_ARG_INDEX: + return True + elif node.target is torch.nn.functional.linear: + for kwarg_name, kwarg_value in node.kwargs.items(): + if kwarg_name == 'bias' and arg is kwarg_value: + return True return False # weight prepacking ops From de07d07600881af7dff7718c7f773ebc12bed696 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 22 Dec 2020 16:47:34 -0800 Subject: [PATCH 16/45] fx quant: improve types on convert (#49688) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49688 Adds more types on FX quantize convert, fixing things as they are uncovered by mypy. 
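As an illustrative sketch of the annotation style being added (names mirror the diff below; this block itself is not part of the patch):

```
from typing import Dict
from torch.fx.node import Node

# The convert pass tracks two environments keyed by node name:
env: Dict[str, Node] = {}        # nodes that produce float (non-quantized) values
quant_env: Dict[str, Node] = {}  # nodes that produce quantized values
```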
Test Plan: ``` mypy torch/quantization python test/test_quantization.py TestQuantizeFx python test/test_quantization.py TestQuantizeFxOps ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25667231 fbshipit-source-id: 262713c6ccb050a05e3119c0457d0335dde82d25 --- torch/quantization/fx/quantize.py | 55 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 2cdd7b59b314..c57b2c02aa86 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -10,6 +10,8 @@ Node, ) +from torch.fx.node import Argument + from torch.quantization import ( propagate_qconfig_, convert, @@ -583,15 +585,15 @@ def _convert(self, model: GraphModule, debug: bool = False, self._find_quants(model.graph, matches) self.quantized_graph = Graph() - env: Dict[Any, Any] = {} - quant_env: Dict[Any, Any] = {} + env: Dict[str, Node] = {} + quant_env: Dict[str, Node] = {} - graph_inputs = [] + graph_inputs: List[str] = [] for node in model.graph.nodes: if node.op == 'placeholder': graph_inputs.append(node.name) - def load_non_quantized(n): + def load_non_quantized(n: Node) -> Node: if n.name not in env: assert n.name in quant_env, \ 'trying to load float node but did not find ' + \ @@ -601,13 +603,13 @@ def load_non_quantized(n): env[n.name] = Proxy(quant_env[n.name]).dequantize().node return env[n.name] - def load_quantized(n): + def load_quantized(n: Node) -> Node: assert n.name in quant_env, \ 'trying to load quantized node but did not find node:' + \ n.name + ' in quant environment:' + str(quant_env) return quant_env[n.name] - def load_x(n): + def load_x(n: Node) -> Node: assert n.name in env or n.name in quant_env, \ 'node ' + n.name + ' does not exist in either environment' if n.name in quant_env: @@ -615,7 +617,8 @@ def load_x(n): else: return env[n.name] - def load_arg(quantized): + def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - if quantized is a list or tuple, then arg should be a list and @@ -650,18 +653,20 @@ def load_arg_impl(arg_or_args): return type(arg_or_args)(loaded_args) return load_arg_impl - def is_quantized(node): - if isinstance(node, Node): - assert node.name in env or node.name in quant_env, \ - 'Expecting node to be in the environment' + def node_arg_is_quantized(node_arg: Any) -> bool: + if isinstance(node_arg, Node): + assert node_arg.name in env or node_arg.name in quant_env, \ + 'Expecting node_arg to be in the environment' # there might be nodes appearing in both environemnts, but # quant_env will take precedence - if node.name in quant_env: + if node_arg.name in quant_env: return True - elif node.name in env: + elif node_arg.name in env: return False - elif isinstance(node, list): - quantized = map(is_quantized, node) + else: + return False + elif isinstance(node_arg, list): + quantized = map(node_arg_is_quantized, node_arg) if all(quantized): return True elif not any(quantized): @@ -669,8 +674,10 @@ def is_quantized(node): else: raise Exception( "partially quantized inputs in list not handled yet") + else: + return False - def is_output_quantized(node) -> bool: + def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None # by default the output is expected to be quantized @@ -687,7 +694,7 @@ def is_output_quantized(node) -> 
bool: 'call_function', 'call_method'], \ 'CopyNode of type ' + node.op + ' is not handled' - quantized = is_quantized(node.args[0]) + quantized = node_arg_is_quantized(node.args[0]) if not activation_is_statically_quantized(qconfig) or \ not input_output_observed(obj): @@ -695,10 +702,11 @@ def is_output_quantized(node) -> bool: return quantized - def insert_quantize_node(node): + def insert_quantize_node(node: Node) -> None: """ Given a activation_post_process module call node, insert a quantize node""" assert self.modules is not None + assert isinstance(node.target, str) observer_module = self.modules[node.target] prev_node = node.args[0] if observer_module.dtype == torch.float16: @@ -710,13 +718,14 @@ def insert_quantize_node(node): # later in a separate pass env[node.name] = self.quantized_graph.node_copy( node, load_non_quantized) - elif prev_node.name in quant_env: + elif isinstance(prev_node, Node) and prev_node.name in quant_env: # if previous node is already quantized, we'll just remove the # activation_post_process quant_env[node.name] = quant_env[prev_node.name] else: # replace activation post process with quantization ops root_module = self.modules[""] + assert isinstance(node.args[0], Node) quant_env[node.name] = quantize_node( root_module, self.quantized_graph, load_non_quantized(node.args[0]), observer_module) @@ -762,7 +771,7 @@ def insert_quantize_node(node): if is_standalone_module_node: quantized = False else: - quantized = is_output_quantized(node) + quantized = is_output_quantized(node, obj) if quantized: quant_env[node.name] = result @@ -794,12 +803,12 @@ def insert_quantize_node(node): act_post_process_removed_graph = Graph() env = {} - def load_arg(a): # type: ignore + def load_arg_simple(a: Argument) -> Argument: return map_arg(a, lambda node: env[node.name]) for node in self.quantized_graph.nodes: if node.op == 'output': act_post_process_removed_graph.output( - map_arg(node.args[0], load_arg)) + map_arg(node.args[0], load_arg_simple)) continue if node.op == 'call_module' and \ is_activation_post_process(self.modules[node.target]): @@ -807,7 +816,7 @@ def load_arg(a): # type: ignore env[node.name] = env[node.args[0].name] else: env[node.name] = act_post_process_removed_graph.node_copy( - node, load_arg) + node, load_arg_simple) # removes qconfig and activation_post_process modules _remove_qconfig(model) From 27f0dd36d9e55ecb77ac492403e58cdb295d48bc Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Tue, 22 Dec 2020 16:52:39 -0800 Subject: [PATCH 17/45] add type annotations to torch.nn.parallel._functions (#49687) Summary: Closes gh-49686 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49687 Reviewed By: ngimel Differential Revision: D25680210 Pulled By: zou3519 fbshipit-source-id: 221f7c9a4d3a6213eac6983030b0be51ee1c5b60 --- mypy.ini | 3 --- torch/nn/parallel/_functions.py | 11 ++++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mypy.ini b/mypy.ini index d5b1ed20e081..8c900bcced76 100644 --- a/mypy.ini +++ b/mypy.ini @@ -86,9 +86,6 @@ ignore_errors = True [mypy-torch.nn.modules.pooling] ignore_errors = True -[mypy-torch.nn.parallel._functions] -ignore_errors = True - [mypy-torch.nn.qat.modules.activations] ignore_errors = True diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 1ab8b467b90b..dd42d9a05dfb 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -4,6 +4,7 @@ from . 
import comm from torch.autograd import Function from torch._utils import _get_device_index +from typing import List, Optional class Broadcast(Function): @@ -39,9 +40,9 @@ class ReduceAddCoalesced(Function): def forward(ctx, destination, num_inputs, *grads): ctx.target_gpus = [grads[i].get_device() for i in range(0, len(grads), num_inputs)] - grads = [grads[i:i + num_inputs] - for i in range(0, len(grads), num_inputs)] - return comm.reduce_add_coalesced(grads, destination) + grads_ = [grads[i:i + num_inputs] + for i in range(0, len(grads), num_inputs)] + return comm.reduce_add_coalesced(grads_, destination) @staticmethod def backward(ctx, *grad_outputs): @@ -105,10 +106,10 @@ def backward(ctx, *grad_output): # background streams used for copying -_streams = None +_streams: Optional[List[Optional[torch.cuda.Stream]]] = None -def _get_stream(device): +def _get_stream(device: int): """Gets a background stream for copying between CPU and GPU""" global _streams if device == -1: From 62f9b03b7c8170d1f6d5737b52df07e76a3de10d Mon Sep 17 00:00:00 2001 From: Riley Dulin Date: Tue, 22 Dec 2020 16:58:09 -0800 Subject: [PATCH 18/45] [lint] Apply whitespace linter to all gradle files Summary: Run whitespace and license linters on gradle build files. Reviewed By: zertosh Differential Revision: D25687355 fbshipit-source-id: 44330daac7582fed6c05680bffc74e855a9b1dbc --- android/gradle/android_tasks.gradle | 1 - android/pytorch_android/host/build.gradle | 1 - android/settings.gradle | 1 - 3 files changed, 3 deletions(-) diff --git a/android/gradle/android_tasks.gradle b/android/gradle/android_tasks.gradle index ca188ac72d07..0d5932559e47 100644 --- a/android/gradle/android_tasks.gradle +++ b/android/gradle/android_tasks.gradle @@ -1,4 +1,3 @@ - import java.nio.file.Files import java.nio.file.Paths import java.io.FileOutputStream diff --git a/android/pytorch_android/host/build.gradle b/android/pytorch_android/host/build.gradle index a808ae882ce4..fe30660929b9 100644 --- a/android/pytorch_android/host/build.gradle +++ b/android/pytorch_android/host/build.gradle @@ -38,4 +38,3 @@ dependencies { } apply from: rootProject.file('gradle/release.gradle') - diff --git a/android/settings.gradle b/android/settings.gradle index 09473fa34281..743f388b6507 100644 --- a/android/settings.gradle +++ b/android/settings.gradle @@ -4,4 +4,3 @@ project(':pytorch_android_torchvision').projectDir = file('pytorch_android_torch project(':pytorch_host').projectDir = file('pytorch_android/host') project(':test_app').projectDir = file('test_app/app') - From 010b9c52f41ed95aa7408bd91a2bd9d1e6703422 Mon Sep 17 00:00:00 2001 From: "Will Feng (DPER)" Date: Tue, 22 Dec 2020 17:42:48 -0800 Subject: [PATCH 19/45] Skip None submodule during JIT-tracing (#49765) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49765 Some PyTorch module can have None as submodule, which causes the following error in JIT-tracing: Repro script: ``` import torch class TestModule(torch.nn.Module): def __init__(self): super().__init__() self.submod = torch.nn.Linear(3, 4) self.submod = None def forward(self, inputs): return inputs m = TestModule() tm = torch.jit.trace(m, torch.tensor(1.)) ``` Error: ``` Traceback (most recent call last): File "", line 1, in File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 742, in trace _module_class, File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 928, in trace_module module = make_module(mod, _module_class, 
_compilation_unit) File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 560, in make_module return _module_class(mod, _compilation_unit=_compilation_unit) File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 1039, in __init__ submodule, TracedModule, _compilation_unit=None File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 560, in make_module return _module_class(mod, _compilation_unit=_compilation_unit) File "/data/miniconda3/envs/master_nightly/lib/python3.7/site-packages/torch/jit/_trace.py", line 988, in __init__ assert isinstance(orig, torch.nn.Module) AssertionError ``` This pull request changes the JIT-tracing logic to skip the None submodule when tracing. Test Plan: `buck test mode/dev //caffe2/test:jit -- test_trace_skip_none_submodule` Reviewed By: wanchaol Differential Revision: D25670948 fbshipit-source-id: 468f42f5ddbb8fd3de06d0bc224dc67bd7172358 --- test/jit/test_tracer.py | 14 ++++++++++++++ torch/jit/_trace.py | 2 ++ 2 files changed, 16 insertions(+) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 291461a8030b..059f59ff8702 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1846,6 +1846,20 @@ def f(x): with self.assertRaisesRegex(RuntimeError, r"Type 'Tuple\[int\]' cannot be traced"): torch.jit.trace(f, (1,)) + def test_trace_skip_none_submodule(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.submod = torch.nn.Linear(3, 4) + self.submod = None + + def forward(self, inputs): + return inputs + + m = TestModule() + tm = torch.jit.trace(m, torch.tensor(1.)) + self.assertFalse(hasattr(tm, "submod")) + class TestMixTracingScripting(JitTestCase): def test_trace_script(self): diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index b9120f52379e..c424d23e050a 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -1035,6 +1035,8 @@ def check_unique(param): ) for name, submodule in orig._modules.items(): + if submodule is None: + continue tmp_module._modules[name] = make_module( submodule, TracedModule, _compilation_unit=None ) From abacf2703893000cdaf7d9363264599ad137dd5b Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 22 Dec 2020 17:54:54 -0800 Subject: [PATCH 20/45] Revert D25623219: [pytorch][PR] early terminate when CUDA assert were thrown Test Plan: revert-hammer Differential Revision: D25623219 (https://github.com/pytorch/pytorch/commit/be091600eddada5c47377b93db922263910840c7) Original commit changeset: 1b414623ecce fbshipit-source-id: ba304c57eea29d19550ac1e864ccfcd0cec68bec --- test/test_testing.py | 52 +------------------ torch/testing/_internal/common_device_type.py | 14 ----- 2 files changed, 2 insertions(+), 64 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index 9285166cb15e..b87345186cb3 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -3,9 +3,9 @@ import math from torch.testing._internal.common_utils import \ - (TestCase, make_tensor, run_tests, slowTest) + (TestCase, run_tests, make_tensor) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) + (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes) # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -438,54 +438,6 @@ def test_assert_messages(self, device): self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, 
debug_msg="no_user_msg")) self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) - @onlyCUDA - @slowTest - def test_cuda_assert_should_stop_test_suite(self, device): - # This test is slow because it spawn another process to run another test suite. - import subprocess - import sys - - problematic_test_script = """\ -#!/usr/bin/env python - -import torch - -from torch.testing._internal.common_utils import (TestCase, run_tests) -from torch.testing._internal.common_device_type import instantiate_device_type_tests - -# This test is added to ensure that test suite terminates early when -# CUDA assert was thrown since all subsequent test will fail. -# See: https://github.com/pytorch/pytorch/issues/49019 -# This test file should be invoked from test_testing.py -class TestThatContainsCUDAAssertFailure(TestCase): - - def test_throw_unrecoverable_cuda_exception(self, device): - x = torch.rand(10, device=device) - # cause unrecoverable CUDA exception, recoverable on CPU - y = x[torch.tensor([25])].cpu() - - def test_trivial_passing_test_case_on_cpu_cuda(self, device): - x1 = torch.tensor([0., 1.], device=device) - x2 = torch.tensor([0., 1.], device='cpu') - self.assertEqual(x1, x2) - -instantiate_device_type_tests( - TestThatContainsCUDAAssertFailure, - globals(), - except_for=None -) - -if __name__ == '__main__': - run_tests() -""" - - # Test running of cuda assert test suite should early terminate. - p = subprocess.run([sys.executable, '-c', problematic_test_script], capture_output=True, timeout=120) - # should capture CUDA error - self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) - # should run only 3 tests - 2 CPUs and 1 CUDA (remaining CUDA test should skip) - self.assertIn('Ran 3 tests', p.stderr.decode('ascii')) - instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 73185116a4f5..36f02eff0c0f 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -187,9 +187,6 @@ def _construct_test_name(test_name, op, device_type, dtype): class DeviceTypeTestBase(TestCase): device_type: str = 'generic_device_type' - # Flag to disable test suite early due to unrecoverable error such as CUDA error. - _stop_test_suite = False - # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase._precision @@ -274,11 +271,6 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): self.precision = self._get_precision_override(test_fn, dtype) args = (arg for arg in (device_arg, dtype, op) if arg is not None) result = test_fn(self, *args) - except RuntimeError as rte: - if 'CUDA error: device-side assert triggered' in rte.__repr__(): - self._stop_test_suite = True - # raise the runtime error as is. - raise rte finally: self.precision = guard_precision @@ -321,12 +313,6 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): for dtype in dtypes: instantiate_test_helper(cls, name, test=test, dtype=dtype, op=None) - def run(self, result=None): - super().run(result=result) - # Early terminate test if _stop_test_suite is set. 
- if self._stop_test_suite: - result.stop() - class CPUTestBase(DeviceTypeTestBase): device_type = 'cpu' From 46b83212d111ba0314e11cb7a308d63ae6532ec9 Mon Sep 17 00:00:00 2001 From: skyline75489 Date: Tue, 22 Dec 2020 18:04:51 -0800 Subject: [PATCH 21/45] Remove unused six code for Python 2/3 compatibility (#48077) Summary: This is basically a reborn version of https://github.com/pytorch/pytorch/issues/45254 . Ref: https://github.com/pytorch/pytorch/issues/42919 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48077 Reviewed By: ngimel Differential Revision: D25687042 Pulled By: bugra fbshipit-source-id: 05f20a6f3c5212f73d0b1505b493b720e6cf74e5 --- benchmarks/distributed/ddp/benchmark.py | 4 ---- benchmarks/distributed/ddp/diff.py | 4 ---- caffe2/contrib/tensorboard/tensorboard_exporter.py | 3 +-- caffe2/python/context.py | 4 ++-- caffe2/python/experiment_util.py | 3 +-- caffe2/python/hypothesis_test_util.py | 5 ++--- caffe2/python/layer_model_helper.py | 9 ++++----- caffe2/python/layer_parameter_sharing_test.py | 3 +-- caffe2/python/layers/functional.py | 3 +-- caffe2/python/layers/sampling_trainable_mixin.py | 3 +-- caffe2/python/layers/tags.py | 4 ++-- caffe2/python/model_helper.py | 3 +-- caffe2/python/modeling/initializers.py | 4 +--- caffe2/python/modeling/net_modifier.py | 3 +-- caffe2/python/onnx/tests/c2_ref_test.py | 6 ++---- caffe2/python/onnx/tests/conversion_test.py | 7 +++---- caffe2/python/operator_test/image_input_op_test.py | 4 ++-- caffe2/python/operator_test/reshape_ops_test.py | 1 - caffe2/python/operator_test/utility_ops_test.py | 3 +-- caffe2/python/parallel_workers.py | 3 +-- caffe2/python/python_op_test.py | 5 ++--- caffe2/python/rnn_cell.py | 3 +-- torch/utils/tensorboard/_caffe2_graph.py | 3 +-- torch/utils/tensorboard/_convert_np.py | 3 +-- torch/utils/tensorboard/writer.py | 5 ++--- 25 files changed, 34 insertions(+), 64 deletions(-) diff --git a/benchmarks/distributed/ddp/benchmark.py b/benchmarks/distributed/ddp/benchmark.py index 4567749665f6..202ad3c5f56c 100644 --- a/benchmarks/distributed/ddp/benchmark.py +++ b/benchmarks/distributed/ddp/benchmark.py @@ -26,10 +26,6 @@ import torchvision -if not torch._six.PY3: - raise RuntimeError("DDP benchmark requires Python 3") - - def allgather_object(obj): buffer = io.BytesIO() torch.save(obj, buffer) diff --git a/benchmarks/distributed/ddp/diff.py b/benchmarks/distributed/ddp/diff.py index d2e096dbfaf2..dc984626888a 100644 --- a/benchmarks/distributed/ddp/diff.py +++ b/benchmarks/distributed/ddp/diff.py @@ -9,10 +9,6 @@ import numpy as np -if not torch._six.PY3: - raise RuntimeError("DDP benchmark requires Python 3") - - def load(path): with open(path, 'r') as f: return json.load(f) diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index ef12ce563cde..a9a1651a9b99 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -7,7 +7,6 @@ import copy import logging import os -import six from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace @@ -93,7 +92,7 @@ def _get_blob_names(ops): def _remap_keys(m, f): - m2 = {f(key): value for key, value in six.iteritems(m)} + m2 = {f(key): value for key, value in m.items()} m.clear() m.update(m2) diff --git a/caffe2/python/context.py b/caffe2/python/context.py index 503e4fcc4159..ce9b312855e6 100644 --- a/caffe2/python/context.py +++ b/caffe2/python/context.py @@ -3,7 +3,7 @@ import inspect import threading 
-import six +import functools class _ContextInfo(object): @@ -91,7 +91,7 @@ def __exit__(self, *args): _context_registry().get(cls).exit(self) def __call__(self, func): - @six.wraps(func) + @functools.wraps(func) def wrapper(*args, **kwargs): with self: return func(*args, **kwargs) diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py index 822a0a2950ba..6084312df84f 100644 --- a/caffe2/python/experiment_util.py +++ b/caffe2/python/experiment_util.py @@ -10,7 +10,6 @@ import logging import socket import abc -import six from collections import OrderedDict from future.utils import viewkeys, viewvalues @@ -26,7 +25,7 @@ class ExternalLogger(object): - six.add_metaclass(abc.ABCMeta) + __metaclass__ = abc.ABCMeta @abc.abstractmethod def set_runtime_args(self, runtime_args): diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 5df45c6b6405..0fc489d77273 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -50,7 +50,6 @@ import logging import numpy as np import os -import six import struct @@ -748,5 +747,5 @@ def assertRunOpRaises( if regexp is None: self.assertRaises(exception, workspace.RunOperatorOnce, op) else: - six.assertRaisesRegex( - self, exception, regexp, workspace.RunOperatorOnce, op) + self.assertRaisesRegex( + exception, regexp, workspace.RunOperatorOnce, op) diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 7c3dda3b320c..9d825f3827b9 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -22,7 +22,6 @@ import logging import numpy as np -import six import copy logger = logging.getLogger(__name__) @@ -125,7 +124,7 @@ def filter_metrics_schema(self, white_set): def add_ad_hoc_plot_blob(self, blob, dtype=None): assert isinstance( - blob, (six.string_types, core.BlobReference) + blob, (str, core.BlobReference) ), "expect type str or BlobReference, but got {}".format(type(blob)) dtype = dtype or (np.float, (1, )) self.add_metric_field(str(blob), schema.Scalar(dtype, blob)) @@ -173,7 +172,7 @@ def initializer(blob_name): def add_global_constant( self, name, array=None, dtype=None, initializer=None ): - assert isinstance(name, six.string_types), ( + assert isinstance(name, str), ( 'name should be a string as we are using it as map key') # This is global namescope for constants. They will be created in all # init_nets and there should be very few of them. @@ -310,7 +309,7 @@ def create_param(self, param_name, shape, initializer, optimizer=None, ps_param=None, regularizer=None): if isinstance(param_name, core.BlobReference): param_name = str(param_name) - elif isinstance(param_name, six.string_types): + elif isinstance(param_name, str): # Parameter name will be equal to current Namescope that got # resolved with the respect of parameter sharing of the scopes. 
param_name = parameter_sharing_context.get_parameter_name( @@ -750,6 +749,6 @@ def breakdown_map(self, breakdown_map): # TODO(xlwang): provide more rich feature information in breakdown_map; # and change the assertion accordingly assert isinstance(breakdown_map, dict) - assert all(isinstance(k, six.string_types) for k in breakdown_map) + assert all(isinstance(k, str) for k in breakdown_map) assert sorted(breakdown_map.values()) == list(range(len(breakdown_map))) self._breakdown_map = breakdown_map diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 518412b9e90c..8e1831a2ff35 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -9,7 +9,6 @@ ) from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer from caffe2.python.layer_test_util import LayersTestCase -import six class ParameterSharingTest(LayersTestCase): @@ -116,7 +115,7 @@ def test_layer_shared_parameter_name_different_shapes(self): self.assertEquals(self.model.layers[-1].w, 'global_scope/fc/w') - with six.assertRaisesRegex(self, ValueError, 'Got inconsistent shapes .*'): + with self.assertRaisesRegex(ValueError, 'Got inconsistent shapes .*'): self.model.FC( self.model.input_feature_schema.float_features, output_dims + 1 diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index c6d156fd68ce..bc47c474ac8f 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -11,7 +11,6 @@ ) import caffe2.proto.caffe2_pb2 as caffe2_pb2 import numpy as np -import six import logging logger = logging.getLogger(__name__) @@ -31,7 +30,7 @@ def __init__(self, model, input_record, output_names_or_num, function, self._kwargs = kwargs return_struct = ( isinstance(output_names_or_num, list) or - (isinstance(output_names_or_num, six.integer_types) and + (isinstance(output_names_or_num, int) and output_names_or_num != 1) ) diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py index 403cc5a4a51c..79c928d21252 100644 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ b/caffe2/python/layers/sampling_trainable_mixin.py @@ -6,10 +6,9 @@ import abc -import six -class SamplingTrainableMixin(six.with_metaclass(abc.ABCMeta, object)): +class SamplingTrainableMixin(metaclass=abc.ABCMeta): def __init__(self, *args, **kwargs): super(SamplingTrainableMixin, self).__init__(*args, **kwargs) diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index b36f9a99cd13..613fdbe8f45d 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -5,7 +5,7 @@ -import six +import functools from caffe2.python import context @@ -104,7 +104,7 @@ def __exit__(self, type, value, traceback): TagContext.current().remove_tags(self.tags) def __call__(self, func): - @six.wraps(func) + @functools.wraps(func) def wrapper(*args, **kwargs): with self: return func(*args, **kwargs) diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index a5a4865c0ec1..5eb81d898b33 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -21,7 +21,6 @@ from itertools import chain import logging -import six # _known_working_ops are operators that do not need special care. @@ -199,7 +198,7 @@ def create_param(self, param_name, shape, initializer, tags=None): # ParameterSharing will be applied. 
if isinstance(param_name, core.BlobReference): param_name = str(param_name) - elif isinstance(param_name, six.string_types): + elif isinstance(param_name, str): # Parameter name will be equal to current Namescope that got # resolved with the respect of parameter sharing of the scopes. param_name = parameter_sharing_context.get_parameter_name( diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index b3e4b1a44dd7..ba4236d04654 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -6,8 +6,6 @@ from caffe2.python.core import DataType, BlobReference, ScopedBlobReference from caffe2.python.modeling.parameter_info import ParameterInfo -import six - class Initializer(object): ''' @@ -47,7 +45,7 @@ class ExternalInitializer(object): def create_param(self, param_name, init_net, shape): if isinstance(param_name, BlobReference): param = BlobReference(str(param_name), init_net) - elif isinstance(param_name, six.string_types): + elif isinstance(param_name, str): param = ScopedBlobReference(param_name, init_net) else: raise TypeError("Unsupported type for param_name") diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py index e824c828e4bd..c0545fad08f5 100644 --- a/caffe2/python/modeling/net_modifier.py +++ b/caffe2/python/modeling/net_modifier.py @@ -4,10 +4,9 @@ import abc -import six -class NetModifier(six.with_metaclass(abc.ABCMeta, object)): +class NetModifier(metaclass=abc.ABCMeta): """ An abstraction class for supporting modifying a generated net. Inherited classes should implement the modify_net method where diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d253b06658a3..d2efcc79823e 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -8,7 +8,6 @@ import json import os -import six import unittest from caffe2.python import core @@ -44,9 +43,8 @@ def test_check_arguments(self): b2.convert_node(node_def.SerializeToString()) bad_node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"], foo=42, bar=56) - with six.assertRaisesRegex(self, - RuntimeError, - "Don't know how to map unexpected argument (foo|bar)"): + with self.assertRaisesRegex(RuntimeError, + "Don't know how to map unexpected argument (foo|bar)"): b2.convert_node(bad_node_def.SerializeToString()) def test_dynamicslice_3inputs_graph(self): diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py index fe73ee6a1039..1bb457491b85 100644 --- a/caffe2/python/onnx/tests/conversion_test.py +++ b/caffe2/python/onnx/tests/conversion_test.py @@ -6,7 +6,6 @@ import json -import six import tempfile import textwrap import traceback @@ -82,9 +81,9 @@ def test_caffe2_to_onnx_value_info(self): caffe2_net.flush() args = [caffe2_net.name, '--output', output.name] - six.assertRaisesRegex(self, Exception, - 'value info', - self._run_command, caffe2_to_onnx, args) + self.assertRaisesRegex(Exception, + 'value info', + self._run_command, caffe2_to_onnx, args) args.extend([ '--value-info', diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py index 0de1f0ad048b..6bed69af9ae0 100644 --- a/caffe2/python/operator_test/image_input_op_test.py +++ b/caffe2/python/operator_test/image_input_op_test.py @@ -13,7 +13,7 @@ from PIL import Image import numpy as np import shutil -import six +import io import sys import tempfile @@ -134,7 
+134,7 @@ def create_test(output_dir, width, height, default_bound, minsize, crop, means, img_array = np.random.random_integers( 0, 255, [height, width, 3]).astype(np.uint8) img_obj = Image.fromarray(img_array) - img_str = six.BytesIO() + img_str = io.BytesIO() img_obj.save(img_str, 'PNG') # Create a random bounding box for every other image diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index a42f00bbf82f..dc90b6815f01 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -3,7 +3,6 @@ import numpy as np -import six from numpy.testing import assert_array_equal from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 241d1e4c1b56..aeefbf596afe 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import random -import six class TestUtilityOps(serial.SerializedTestCase): @@ -474,7 +473,7 @@ def test_range(self, gc, dc): names[len(inputs) - 1], ["Y"] ) - with six.assertRaisesRegex(self, RuntimeError, 'Step size cannot be 0'): + with self.assertRaisesRegex(RuntimeError, 'Step size cannot be 0'): self.assertReferenceChecks( device_option=gc, op=op, diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py index 4ee446610bdb..067f4794a89f 100644 --- a/caffe2/python/parallel_workers.py +++ b/caffe2/python/parallel_workers.py @@ -38,7 +38,6 @@ import atexit import time import collections -import six import traceback from abc import ABCMeta, abstractmethod @@ -110,7 +109,7 @@ def put_metric(self, key, value, count=True): class State(): - six.add_metaclass(ABCMeta) + __metaclass__ = ABCMeta @abstractmethod def start(self): diff --git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py index 893671b96f45..4b39adc3f36a 100644 --- a/caffe2/python/python_op_test.py +++ b/caffe2/python/python_op_test.py @@ -8,7 +8,6 @@ from hypothesis import given, settings import hypothesis.strategies as st import numpy as np -import six class CustomError(Exception): @@ -55,12 +54,12 @@ def f(inputs, _): def test_exception(self): op = CreatePythonOperator(MainOpFunctionThatThrowsCustomError, [], []) - with six.assertRaisesRegex(self, CustomError, "This is an intentional exception."): + with self.assertRaisesRegex(CustomError, "This is an intentional exception."): workspace.RunOperatorOnce(op) def test_exception_builder(self): op = CreatePythonOperator(MainOpFunctionThatThrowsCustomErrorInBuilder, [], []) - with six.assertRaisesRegex(self, CustomError, "This is an intentional exception in builder."): + with self.assertRaisesRegex(CustomError, "This is an intentional exception in builder."): workspace.RunOperatorOnce(op) @given(x=hu.tensor()) diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index aee934c31826..9c85d0efd2a5 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -11,7 +11,6 @@ import logging import numpy as np import random -import six from future.utils import viewkeys from caffe2.proto import caffe2_pb2 @@ -32,7 +31,7 @@ def _RectifyName(blob_reference_or_name): if blob_reference_or_name is None: return None - if isinstance(blob_reference_or_name, six.string_types): + if isinstance(blob_reference_or_name, str): return core.ScopedBlobReference(blob_reference_or_name) if not 
isinstance(blob_reference_or_name, core.BlobReference): raise Exception("Unknown blob reference type") diff --git a/torch/utils/tensorboard/_caffe2_graph.py b/torch/utils/tensorboard/_caffe2_graph.py index 218f2382c86c..3cd3a3608fed 100644 --- a/torch/utils/tensorboard/_caffe2_graph.py +++ b/torch/utils/tensorboard/_caffe2_graph.py @@ -2,7 +2,6 @@ import logging import os import re -import six from tensorboard.compat.proto.graph_pb2 import GraphDef from tensorboard.compat.proto.node_def_pb2 import NodeDef @@ -162,7 +161,7 @@ def _remap_keys(old_dict, rename_fn): None. Modifies old_dict in-place. ''' new_dict = {rename_fn(key): value for key, - value in six.iteritems(old_dict)} + value in old_dict.items()} old_dict.clear() old_dict.update(new_dict) diff --git a/torch/utils/tensorboard/_convert_np.py b/torch/utils/tensorboard/_convert_np.py index 465eba41e859..0e8fd663f106 100644 --- a/torch/utils/tensorboard/_convert_np.py +++ b/torch/utils/tensorboard/_convert_np.py @@ -3,7 +3,6 @@ """ import numpy as np import torch -import six def make_np(x): @@ -16,7 +15,7 @@ def make_np(x): """ if isinstance(x, np.ndarray): return x - if isinstance(x, six.string_types): # Caffe2 will pass name of blob(s) to fetch + if isinstance(x, str): # Caffe2 will pass name of blob(s) to fetch return _prepare_caffe2(x) if np.isscalar(x): return np.array([x]) diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py index 7c2cef41cab7..1164839adf83 100644 --- a/torch/utils/tensorboard/writer.py +++ b/torch/utils/tensorboard/writer.py @@ -2,7 +2,6 @@ consumed by TensorBoard for visualization.""" import os -import six import time import torch @@ -243,7 +242,7 @@ def _check_caffe2_blob(self, item): workspace.FetchBlob(blob_name) workspace.FetchBlobs([blob_name1, blob_name2, ...]) """ - return isinstance(item, six.string_types) + return isinstance(item, str) def _get_file_writer(self): """Returns the default FileWriter instance. Recreates it if closed.""" @@ -424,7 +423,7 @@ def add_histogram(self, tag, values, global_step=None, bins='tensorflow', wallti if self._check_caffe2_blob(values): from caffe2.python import workspace values = workspace.FetchBlob(values) - if isinstance(bins, six.string_types) and bins == 'tensorflow': + if isinstance(bins, str) and bins == 'tensorflow': bins = self.default_bins self._get_file_writer().add_summary( histogram(tag, values, bins, max_bins=max_bins), global_step, walltime) From 461aafe389b7b5b415c3b4878abaa0ba1769f132 Mon Sep 17 00:00:00 2001 From: Kshiteej K Date: Tue, 22 Dec 2020 18:41:13 -0800 Subject: [PATCH 22/45] [numpy] `torch.angle`: promote integer inputs to float (#49163) Summary: **BC-Breaking Note:** This PR updates PyTorch's angle operator to be consistent with NumPy's. Previously angle would return zero for all floating point values (including NaN). Now angle returns `pi` for negative floating point values, zero for non-negative floating point values, and propagates NaNs. **PR Summary:** Reference: https://github.com/pytorch/pytorch/issues/42515 TODO: * [x] Add BC-Breaking Note (Prev all real numbers returned `0` (even `nan`)) -> Fixed to match the correct behavior of NumPy. 
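For example, the new behavior looks like this (a sketch of expected values, assuming a build that includes this change):

```
import torch

t = torch.tensor([-1.0, 0.0, 2.0, float('nan')])
torch.angle(t)   # tensor([3.1416, 0.0000, 0.0000, nan]): pi for negatives, 0 otherwise, NaN propagated

# Integer inputs are now promoted to float, matching np.angle:
torch.angle(torch.tensor([-3, 0, 5]))   # tensor([3.1416, 0.0000, 0.0000])

# Complex inputs are unchanged and still return the phase:
torch.angle(torch.tensor([1 + 1j]))     # tensor([0.7854]), i.e. pi / 4
```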
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49163 Reviewed By: ngimel Differential Revision: D25681758 Pulled By: mruberry fbshipit-source-id: 54143fe6bccbae044427ff15d8daaed3596f9685 --- aten/src/ATen/cpu/vec256/vec256_base.h | 2 +- aten/src/ATen/cpu/vec256/vec256_bfloat16.h | 18 +++++++++++++++++- aten/src/ATen/cpu/vec256/vec256_double.h | 11 ++++++++++- aten/src/ATen/cpu/vec256/vec256_float.h | 11 ++++++++++- aten/src/ATen/cpu/vec256/vec256_int.h | 12 ------------ aten/src/ATen/native/UnaryOps.cpp | 18 ++++++++++++++---- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- aten/src/ATen/native/cpu/zmath.h | 11 ++++++++++- .../ATen/native/cuda/UnaryComplexKernels.cu | 8 ++++++-- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 12 +++++++----- torch/_torch_docs.py | 5 +++++ .../_internal/common_methods_invocations.py | 15 +++++++++++++-- 13 files changed, 94 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 2449647d848c..477e366ea18b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -251,7 +251,7 @@ struct Vec256 { Vec256 angle() const { // other_t_angle is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same::value, "other_t_angle must be T"); - return Vec256(0); + return map(at::native::angle_impl); // compiler is unable to resolve the overload without } template ::value, int>::type = 0> diff --git a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h index 43389fe61583..dbe9cf374d95 100644 --- a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h @@ -203,7 +203,23 @@ template <> class Vec256 { return cvtfp32_bf16(o1, o2); } Vec256 angle() const { - return _mm256_set1_epi16(0); + __m256 lo, hi; + cvtbf16_fp32(values, lo, hi); + auto angle_lambda = [](__m256 values) { + const auto zero_vec = _mm256_set1_ps(0.f); + const auto nan_vec = _mm256_set1_ps(NAN); + const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_ps(M_PI); + + const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); + angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); + return angle; + }; + auto o1 = angle_lambda(lo); + auto o2 = angle_lambda(hi); + return cvtfp32_bf16(o1, o2); } Vec256 real() const { return *this; diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index b372546d3b6a..0bea07dbf592 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -108,7 +108,16 @@ template <> class Vec256 { return _mm256_andnot_pd(mask, values); } Vec256 angle() const { - return _mm256_set1_pd(0); + const auto zero_vec = _mm256_set1_pd(0.f); + const auto nan_vec = _mm256_set1_pd(NAN); + const auto not_nan_mask = _mm256_cmp_pd(values, values, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_pd(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_pd(M_PI); + + const auto neg_mask = _mm256_cmp_pd(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_pd(zero_vec, pi, neg_mask); + angle = _mm256_blendv_pd(angle, nan_vec, nan_mask); + return angle; } Vec256 real() const { return *this; diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index 
c83d2c12f95a..a8fd65b0ba79 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -115,7 +115,16 @@ template <> class Vec256 { return _mm256_andnot_ps(mask, values); } Vec256 angle() const { - return _mm256_set1_ps(0); + const auto zero_vec = _mm256_set1_ps(0.f); + const auto nan_vec = _mm256_set1_ps(NAN); + const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); + const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); + const auto pi = _mm256_set1_ps(M_PI); + + const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); + auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); + angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); + return angle; } Vec256 real() const { return *this; diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index 30bf6421adb3..2ba2744d3526 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -121,9 +121,6 @@ class Vec256 : public Vec256i { auto inverse = _mm256_xor_si256(values, is_larger); return _mm256_sub_epi64(inverse, is_larger); } - Vec256 angle() const { - return _mm256_set1_epi64x(0); - } Vec256 real() const { return *this; } @@ -250,9 +247,6 @@ class Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi32(values); } - Vec256 angle() const { - return _mm256_set1_epi32(0); - } Vec256 real() const { return *this; } @@ -467,9 +461,6 @@ class Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi16(values); } - Vec256 angle() const { - return _mm256_set1_epi16(0); - } Vec256 real() const { return *this; } @@ -719,9 +710,6 @@ class Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi8(values); } - Vec256 angle() const { - return _mm256_set1_epi8(0); - } Vec256 real() const { return *this; } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 89061ebeff0a..7ee381dc4374 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -66,7 +66,7 @@ Tensor unary_op_impl_float(const Tensor& self, Stub& stub) { // Note: This is done by running the operation as usual and then copying the // operation's result to the expected result type. template -static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, const Tensor& self, Stub& stub) { +static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, const Tensor& self, Stub& stub, bool promotes_integer_to_float) { if (self.is_complex() && !result.is_complex()) { // Checks if the corresponding float type can be cast to the desired dtype const auto float_type = c10::toValueType(self.scalar_type()); @@ -85,6 +85,10 @@ static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, co return result; } + if (promotes_integer_to_float) { + return unary_op_impl_float_out(result, self, stub); + } + return unary_op_impl_out(result, self, stub); } @@ -173,7 +177,7 @@ Tensor& arctan_(Tensor& self) { return self.atan_(); } // complex input. This makes sense mathematically since the absolute value // and angle of a complex number has no imaginary part. 
Tensor& abs_out(Tensor& result, const Tensor& self) { - return unary_op_impl_with_complex_to_float_out(result, self, abs_stub); + return unary_op_impl_with_complex_to_float_out(result, self, abs_stub, /*promotes_integer_to_float=*/false); } Tensor abs(const Tensor& self) { return unary_op_impl_with_complex_to_float(self, at::abs_out); @@ -195,10 +199,16 @@ Tensor& absolute_(Tensor& self) { } Tensor& angle_out(Tensor& result, const Tensor& self) { - return unary_op_impl_with_complex_to_float_out(result, self, angle_stub); + return unary_op_impl_with_complex_to_float_out(result, self, angle_stub, /*promotes_integer_to_float=*/true); } Tensor angle(const Tensor& self) { - return unary_op_impl_with_complex_to_float(self, at::angle_out); + if (self.is_complex()) { + const auto float_type = c10::toValueType(self.scalar_type()); + Tensor result = at::empty({0}, self.options().dtype(float_type)); + return at::angle_out(result, self); + } + + return unary_op_impl_float(self, angle_stub); } Tensor real(const Tensor& self) { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 453ed91047c5..42a761439ac0 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -167,7 +167,7 @@ static void abs_kernel(TensorIterator& iter) { } static void angle_kernel(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "angle_cpu", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "angle_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return angle_impl(a); }, diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index e0554e0cbc29..7be18ef519b1 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -33,9 +33,18 @@ inline double zabs , double> (c10::complex z) { return std::abs(z); } +// This overload corresponds to non-complex dtypes. +// The function is consistent with its NumPy equivalent +// for non-complex dtypes where `pi` is returned for +// negative real numbers and `0` is returned for 0 or positive +// real numbers. +// Note: `nan` is propagated. template inline VALUE_TYPE angle_impl (SCALAR_TYPE z) { - return 0; + if (at::_isnan(z)) { + return z; + } + return z < 0 ? M_PI : 0; } template<> diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu index 30fa3dc90176..6e192b51494f 100644 --- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -11,7 +12,10 @@ namespace at { namespace native { // We manually overload angle because std::arg does not work with types other than c10::complex. template __host__ __device__ static inline scalar_t angle_wrapper(scalar_t v) { - return 0; + if (at::_isnan(v)){ + return v; + } + return v < 0 ? 
M_PI : 0; } template @@ -20,7 +24,7 @@ __host__ __device__ static inline c10::complex angle_wrapper(c10::complex } void angle_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "angle_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "angle_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return angle_wrapper(a); }); diff --git a/test/test_torch.py b/test/test_torch.py index b4d9ad6f23c0..294aa3bfa920 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6655,7 +6655,6 @@ def inner(self, device, dtype): torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _types, _types_no_half), - ('angle', '', _small_3d, lambda t, d: [], 0, 0, 0, _types_no_half, [torch.bfloat16], False), ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), ('chunk', '', _medium_2d, lambda t, d: [4], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index cfe8fe9c1a04..c96788f113eb 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -212,8 +212,7 @@ def _fn(t): for alt, inplace in ((op.get_method(), False), (op.get_inplace(), True), (torch.jit.script(_fn), False)): if alt is None: - with self.assertRaises(RuntimeError): - alt(t.clone()) + continue if inplace and op.promotes_integers_to_float and dtype in integral_types() + (torch.bool,): # Assert that RuntimeError is raised @@ -426,9 +425,12 @@ def compare_out(op, input, out): if out_dtype.is_floating_point and not dtype.is_complex: compare_out(op, input, output) elif out_dtype.is_floating_point and dtype.is_complex: - # Can't cast complex to float - with self.assertRaises(RuntimeError): - op(input, out=output) + if op.supports_complex_to_float: + compare_out(op, input, output) + else: + # Can't cast complex to float + with self.assertRaises(RuntimeError): + op(input, out=output) elif out_dtype.is_complex: compare_out(op, input, output) else: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ae0ffd916e51..073883b60407 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -625,6 +625,11 @@ def merge_dicts(*dicts): Keyword args: {out} +.. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers, + zero for non-negative real numbers, and propagates NaNs. Previously + the function would return zero for all real numbers and not propagate + floating-point NaNs. + Example:: >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159 diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 55b97b38a4da..72a167aa7ca9 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -253,6 +253,7 @@ def __init__(self, handles_large_floats=True, # whether the op correctly handles large float values (like 1e20) handles_extremals=True, # whether the op correctly handles extremal values (like inf) handles_complex_extremals=True, # whether the op correct handles complex extremals (like inf -infj) + supports_complex_to_float=False, # op supports casting from complex input to real output safely eg. 
angle sample_inputs_func=sample_inputs_unary, **kwargs): super(UnaryUfuncInfo, self).__init__(name, @@ -267,6 +268,7 @@ def __init__(self, self.handles_large_floats = handles_large_floats self.handles_extremals = handles_extremals self.handles_complex_extremals = handles_complex_extremals + self.supports_complex_to_float = supports_complex_to_float # Epsilon to ensure grad and gradgrad checks don't test values # outside a function's domain. @@ -1011,6 +1013,17 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): dtypes=[torch.bfloat16])), promotes_integers_to_float=True, handles_complex_extremals=False), + UnaryUfuncInfo('angle', + ref=np.angle, + dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool), + dtypesIfROCM=all_types_and_complex_and(torch.bool), + decorators=(precisionOverride({torch.float16: 1e-2, + torch.bfloat16: 1e-2}),), + promotes_integers_to_float=True, + supports_complex_to_float=True, + test_inplace_grad=False), OpInfo('linalg.solve', aten_name='linalg_solve', op=torch.linalg.solve, @@ -1389,8 +1402,6 @@ def method_tests(): ('complex', (S, S, S), ((S, S, S),), ''), ('abs', (S, S, S), NO_ARGS, '', (True,)), ('abs', (), NO_ARGS, 'scalar', (True,)), - ('angle', (S, S, S), NO_ARGS, '', (True,)), - ('angle', (), NO_ARGS, 'scalar', (True,)), ('clamp', (S, S, S), (0, 1), '', (True,)), ('clamp', (S, S, S), (None, 0.5), 'min', (True,)), ('clamp', (S, S, S), (0.5, None), 'max', (True,)), From 68d438c9dade66073b3f9657bc077623c22001b9 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Tue, 22 Dec 2020 20:12:40 -0800 Subject: [PATCH 23/45] Add PixelUnshuffle (#49334) Summary: Adds an implementation of `torch.nn.PixelUnshuffle` as the inverse operation of `torch.nn.PixelShuffle`. This addresses https://github.com/pytorch/pytorch/issues/2456 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49334 Test Plan: ``` # Unit tests. python test/test_nn.py TestNN.test_pixel_shuffle_unshuffle # Module test. python test/test_nn.py TestNN.test_PixelUnshuffle # C++ API tests. build/bin/test_api # C++ / python parity tests. python test/test_cpp_api_parity.py # JIT test. python test/test_jit.py TestJitGeneratedFunctional.test_nn_pixel_unshuffle # Override tests. python test/test_overrides.py # Type hint tests. 
python test/test_type_hints.py ``` Screenshots of rendered docs: Screen Shot 2020-12-18 at 12 19 05 PM Screen Shot 2020-12-18 at 12 19 26 PM Screen Shot 2020-12-18 at 12 19 34 PM Screen Shot 2020-12-22 at 12 51 36 PM Screen Shot 2020-12-22 at 12 51 44 PM Reviewed By: mruberry Differential Revision: D25401439 Pulled By: jbschlosser fbshipit-source-id: 209d92ce7295e51699e83616d0c62170a7ce75c8 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/native/PixelShuffle.cpp | 83 +++++++++++++++--- aten/src/ATen/native/native_functions.yaml | 3 + docs/source/nn.functional.rst | 5 ++ docs/source/nn.rst | 1 + test/cpp/api/functional.cpp | 17 ++++ test/cpp/api/modules.cpp | 24 ++++++ test/cpp_api_parity/parity-tracker.md | 1 + test/test_nn.py | 86 +++++++++++++------ tools/pyi/gen_pyi.py | 1 + .../torch/nn/functional/pixelshuffle.h | 10 +++ .../include/torch/nn/modules/pixelshuffle.h | 48 +++++++++-- .../include/torch/nn/options/pixelshuffle.h | 26 ++++++ .../csrc/api/src/nn/modules/pixelshuffle.cpp | 14 +++ torch/nn/functional.py | 23 ++++- torch/nn/modules/__init__.py | 4 +- torch/nn/modules/pixelshuffle.py | 72 ++++++++++++++-- torch/overrides.py | 1 + torch/testing/_internal/common_nn.py | 6 ++ .../_internal/jit_metaprogramming_utils.py | 1 + 20 files changed, 371 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 644d75c04c06..518e74b95d54 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -553,6 +553,7 @@ _(aten, permute) \ _(aten, pin_memory) \ _(aten, pinverse) \ _(aten, pixel_shuffle) \ +_(aten, pixel_unshuffle) \ _(aten, poisson) \ _(aten, polygamma) \ _(aten, pow) \ diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index e6301e682d77..20214470ba28 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -14,12 +14,16 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { TORCH_CHECK(self.dim() >= 3, "pixel_shuffle expects input to have at least 3 dimensions, but got input with ", self.dim(), " dimension(s)"); + TORCH_CHECK( + upscale_factor > 0, + "pixel_shuffle expects a positive upscale_factor, but got ", + upscale_factor); // Format: (B1, ..., Bn), C, H, W int64_t c = self.size(-3); int64_t h = self.size(-2); int64_t w = self.size(-1); const auto NUM_NON_BATCH_DIMS = 3; - const auto last_batch_dim = self.sizes().end() - NUM_NON_BATCH_DIMS; + const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; int64_t upscale_factor_squared = upscale_factor * upscale_factor; TORCH_CHECK(c % upscale_factor_squared == 0, @@ -29,24 +33,81 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { int64_t oh = h * upscale_factor; int64_t ow = w * upscale_factor; - // First, reshape to expand the channels dim from c into 3 separate dims: (oc, upscale_factor, upscale_factor). - // This allows shuffling to be done next by permuting dims. - std::vector expanded_shape(self.sizes().begin(), last_batch_dim); - expanded_shape.insert(expanded_shape.end(), {oc, upscale_factor, upscale_factor, h, w}); - const auto input_expanded = self.reshape(expanded_shape); + // First, reshape to split the channels dim from c into 3 separate dims: (oc, + // upscale_factor, upscale_factor). This allows shuffling to be done next by + // permuting dims. 
+ std::vector added_dims_shape( + self.sizes().begin(), self_sizes_batch_end); + added_dims_shape.insert( + added_dims_shape.end(), {oc, upscale_factor, upscale_factor, h, w}); + const auto input_reshaped = self.reshape(added_dims_shape); // Next, shuffle by permuting the new upscale_factor dims alongside the height and width dims. - std::vector permutation(self.sizes().begin(), last_batch_dim); + std::vector permutation(self.sizes().begin(), self_sizes_batch_end); // std::iota is used to maintain the batch dims within the permutation. - // Since expansion added 2 dims, the correct batch dim offsets are now: -expanded_shape.size(), ..., -7, -6. - std::iota(permutation.begin(), permutation.end(), -expanded_shape.size()); + // Since 2 dims were added, the correct batch dim offsets are now: + // -added_dims_shape.size(), ..., -7, -6. + std::iota(permutation.begin(), permutation.end(), -added_dims_shape.size()); permutation.insert(permutation.end(), {-5 /* oc */, -2 /* h */, -4 /* 1st upscale_factor */, -1 /* w */, -3 /* 2nd upscale_factor */}); - const auto input_permuted = input_expanded.permute(permutation); + const auto input_permuted = input_reshaped.permute(permutation); // Finally, upscale by collapsing (h, upscale_factor) -> a single dim (oh) // and (w, upscale_factor) -> a single dim (ow). - std::vector final_shape(self.sizes().begin(), last_batch_dim); + std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); + final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); +} + + +Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { + TORCH_CHECK(self.dim() >= 3, + "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), " dimension(s)"); + TORCH_CHECK( + downscale_factor > 0, + "pixel_unshuffle expects a positive downscale_factor, but got ", + downscale_factor); + // Format: (B1, ..., Bn), C, H, W + int64_t c = self.size(-3); + int64_t h = self.size(-2); + int64_t w = self.size(-1); + constexpr auto NUM_NON_BATCH_DIMS = 3; + const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; + + TORCH_CHECK(h % downscale_factor == 0, + "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", h, + " is not divisible by ", downscale_factor) + TORCH_CHECK(w % downscale_factor == 0, + "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", w, + " is not divisible by ", downscale_factor) + int64_t downscale_factor_squared = downscale_factor * downscale_factor; + int64_t oc = c * downscale_factor_squared; + int64_t oh = h / downscale_factor; + int64_t ow = w / downscale_factor; + + // First, reshape to split height dim into (oh, downscale_factor) dims and + // width dim into (ow, downscale_factor) dims. This allows unshuffling to be + // done next by permuting dims. + std::vector added_dims_shape( + self.sizes().begin(), self_sizes_batch_end); + added_dims_shape.insert( + added_dims_shape.end(), {c, oh, downscale_factor, ow, downscale_factor}); + const auto input_reshaped = self.reshape(added_dims_shape); + + // Next, unshuffle by permuting the downscale_factor dims alongside the channel dim. + std::vector permutation(self.sizes().begin(), self_sizes_batch_end); + // std::iota is used to maintain the batch dims within the permutation. + // Since 2 dims were added, the correct batch dim offsets are now: + // -added_dims_shape.size(), ..., -7, -6. 
+ std::iota(permutation.begin(), permutation.end(), -added_dims_shape.size()); + permutation.insert(permutation.end(), {-5 /* c */, -3 /* 1st downscale_factor */, -1 /*2nd downscale_factor */, + -4 /* oh */, -2 /* ow */}); + const auto input_permuted = input_reshaped.permute(permutation); + + // Finally, downscale by collapsing (c, downscale_factor, downscale_factor) -> a single dim (oc), + // resulting in height=oh and width=ow. + std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); final_shape.insert(final_shape.end(), {oc, oh, ow}); return input_permuted.reshape(final_shape); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 9c0053f40b7e..48692e792ae3 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3342,6 +3342,9 @@ - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor use_c10_dispatcher: full +- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + use_c10_dispatcher: full + - func: channel_shuffle(Tensor self, int groups) -> Tensor use_c10_dispatcher: full dispatch: diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index 416121cec8d6..17b0e0a80b36 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -496,6 +496,11 @@ Vision functions .. autofunction:: pixel_shuffle +:hidden:`pixel_unshuffle` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pixel_unshuffle + :hidden:`pad` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 4e3e8437b88b..74f7994447a1 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -299,6 +299,7 @@ Vision Layers :template: classtemplate.rst nn.PixelShuffle + nn.PixelUnshuffle nn.Upsample nn.UpsamplingNearest2d nn.UpsamplingBilinear2d diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 707c1bfd7ac0..d4f353f5607f 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -1487,6 +1487,23 @@ TEST_F(FunctionalTest, PixelShuffle) { ASSERT_TRUE(y.allclose(y_exp)); } +TEST_F(FunctionalTest, PixelUnshuffle) { + auto x = torch::tensor( + {{{{-17, 7, 19, 14}, {0, -15, -2, 0}, {-1, -3, 2, 1}, {-12, -3, 14, 9}}}}, + torch::kFloat); + auto y_exp = torch::tensor( + {{{{-17, 19}, {-1, 2}}, + {{7, 14}, {-3, 1}}, + {{0, -2}, {-12, 14}}, + {{-15, 0}, {-3, 9}}}}, + torch::kFloat); + auto y = F::pixel_unshuffle(x, 2); + + ASSERT_EQ(y.ndimension(), 4); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 4, 2, 2})); + ASSERT_TRUE(y.allclose(y_exp)); +} + TEST_F(FunctionalTest, Softplus) { const auto size = 3; for (const auto beta : {0.5, 1.0, 2.0}) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 14ed92f9fb0d..f24f8b42a19b 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2761,6 +2761,24 @@ TEST_F(ModulesTest, PixelShuffle) { ASSERT_TRUE(y.allclose(y_exp)); } +TEST_F(ModulesTest, PixelUnshuffle) { + PixelUnshuffle module(/*downscale_factor=*/2); + auto x = torch::tensor( + {{{{-17, 7, 19, 14}, {0, -15, -2, 0}, {-1, -3, 2, 1}, {-12, -3, 14, 9}}}}, + torch::kFloat); + auto y_exp = torch::tensor( + {{{{-17, 19}, {-1, 2}}, + {{7, 14}, {-3, 1}}, + {{0, -2}, {-12, 14}}, + {{-15, 0}, {-3, 9}}}}, + torch::kFloat); + auto y = module(x); + + ASSERT_EQ(y.ndimension(), 4); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 4, 2, 2})); + ASSERT_TRUE(y.allclose(y_exp)); +} + TEST_F(ModulesTest, Softplus) { const auto size = 3; for (const auto beta : {0.5, 1.0, 2.0}) { @@ 
-4764,6 +4782,12 @@ TEST_F(ModulesTest, PrettyPrintPixelShuffle) { "torch::nn::PixelShuffle(upscale_factor=5)"); } +TEST_F(ModulesTest, PrettyPrintPixelUnshuffle) { + ASSERT_EQ( + c10::str(PixelUnshuffle(PixelUnshuffleOptions(5))), + "torch::nn::PixelUnshuffle(downscale_factor=5)"); +} + TEST_F(ModulesTest, PrettyPrintSoftplus) { ASSERT_EQ(c10::str(Softplus()), "torch::nn::Softplus(beta=1, threshold=20)"); diff --git a/test/cpp_api_parity/parity-tracker.md b/test/cpp_api_parity/parity-tracker.md index 66931b6f9316..55d3f33f32b2 100644 --- a/test/cpp_api_parity/parity-tracker.md +++ b/test/cpp_api_parity/parity-tracker.md @@ -125,6 +125,7 @@ torch::nn::CosineEmbeddingLoss|Yes|No torch::nn::MultiMarginLoss|Yes|No torch::nn::TripletMarginLoss|Yes|No torch::nn::PixelShuffle|Yes|No +torch::nn::PixelUnshuffle|Yes|No torch::nn::Upsample|Yes|No torch::nn::DataParallel|No|No torch::nn::parallel::DistributedDataParallel|No|No diff --git a/test/test_nn.py b/test/test_nn.py index 78aab89611b6..1d63be6e3075 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -6897,8 +6897,9 @@ def test_noncontig_conv_grad_cuda(self, dtype=torch.float): output.backward(grad.contiguous()) self.assertEqual(result, input.grad.data, atol=dtype2prec_DONTUSE[dtype], rtol=0) - def test_pixel_shuffle(self): - def _test_pixel_shuffle_helper(num_input_dims, valid_channels_dim=True): + def test_pixel_shuffle_unshuffle(self): + def _test_pixel_shuffle_unshuffle_helper(num_input_dims, valid_channels_dim=True, + upscale_factor=None): # Function to imperatively ensure pixels are shuffled to the correct locations. # Used to validate the batch operations in pixel_shuffle. def _verify_pixel_shuffle(input, output, upscale_factor): @@ -6911,7 +6912,7 @@ def _verify_pixel_shuffle(input, output, upscale_factor): (c * upscale_factor ** 2) self.assertEqual(output[..., c, h, w], input[..., channel_idx, height_idx, weight_idx]) - upscale_factor = random.randint(2, 5) + upscale_factor = random.randint(2, 5) if upscale_factor is None else upscale_factor # If valid_channels_dim=False, add 1 to make channels dim indivisible by upscale_factor ** 2. channels = random.randint(1, 4) * upscale_factor ** 2 + (0 if valid_channels_dim else 1) height = random.randint(5, 10) @@ -6925,47 +6926,76 @@ def _verify_pixel_shuffle(input, output, upscale_factor): batch_sizes = [random.randint(1, 3) for _ in range(num_input_dims - 3)] input = torch.rand(*batch_sizes, channels, height, width, requires_grad=True) ps = nn.PixelShuffle(upscale_factor) + pus = nn.PixelUnshuffle(downscale_factor=upscale_factor) - if num_input_dims >= 3 and valid_channels_dim: + if num_input_dims >= 3 and valid_channels_dim and upscale_factor > 0: output = ps(input) _verify_pixel_shuffle(input, output, upscale_factor) output.backward(output.data) self.assertEqual(input.data, input.grad.data) + + # Ensure unshuffle properly inverts shuffle. + unshuffle_output = pus(output) + self.assertEqual(input, unshuffle_output) else: self.assertRaises(RuntimeError, lambda: ps(input)) - def test_pixel_shuffle_1D(): - _test_pixel_shuffle_helper(num_input_dims=1) + def _test_pixel_unshuffle_error_case_helper(num_input_dims, valid_height_dim=True, valid_width_dim=True, + downscale_factor=None): + downscale_factor = random.randint(2, 5) if downscale_factor is None else downscale_factor + channels = random.randint(1, 4) + # If valid_height_dim=False, add 1 to make height dim indivisible by downscale_factor. 
+ height = random.randint(3, 5) * abs(downscale_factor) + (0 if valid_height_dim else 1) + # If valid_width_dim=False, add 1 to make width dim indivisible by downscale_factor. + width = random.randint(3, 5) * abs(downscale_factor) + (0 if valid_width_dim else 1) + + if num_input_dims == 1: + input = torch.rand(channels, requires_grad=True) + elif num_input_dims == 2: + input = torch.rand(height, width, requires_grad=True) + else: + batch_sizes = [random.randint(1, 3) for _ in range(num_input_dims - 3)] + input = torch.rand(*batch_sizes, channels, height, width, requires_grad=True) + + pus = nn.PixelUnshuffle(downscale_factor) + self.assertRaises(RuntimeError, lambda: pus(input)) + + def _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims): + # For 1D - 2D, this is an error case. + # For 3D - 5D, this is a success case for pixel_shuffle + pixel_unshuffle. + _test_pixel_shuffle_unshuffle_helper(num_input_dims=num_input_dims) - def test_pixel_shuffle_2D(): - _test_pixel_shuffle_helper(num_input_dims=2) + # Error cases for pixel_shuffle. + _test_pixel_shuffle_unshuffle_helper(num_input_dims=num_input_dims, valid_channels_dim=False) + _test_pixel_shuffle_unshuffle_helper(num_input_dims=num_input_dims, upscale_factor=0) + _test_pixel_shuffle_unshuffle_helper(num_input_dims=num_input_dims, upscale_factor=-2) - def test_pixel_shuffle_3D_with_valid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=3) + # Error cases for pixel_unshuffle. + _test_pixel_unshuffle_error_case_helper(num_input_dims=num_input_dims, valid_height_dim=False) + _test_pixel_unshuffle_error_case_helper(num_input_dims=num_input_dims, valid_width_dim=False) + _test_pixel_unshuffle_error_case_helper(num_input_dims=num_input_dims, downscale_factor=0) + _test_pixel_unshuffle_error_case_helper(num_input_dims=num_input_dims, downscale_factor=-2) - def test_pixel_shuffle_4D_with_valid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=4) + def test_pixel_shuffle_unshuffle_1D(): + _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims=1) - def test_pixel_shuffle_5D_with_valid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=5) + def test_pixel_shuffle_unshuffle_2D(): + _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims=2) - def test_pixel_shuffle_3D_with_invalid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=3, valid_channels_dim=False) + def test_pixel_shuffle_unshuffle_3D(): + _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims=3) - def test_pixel_shuffle_4D_with_invalid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=4, valid_channels_dim=False) + def test_pixel_shuffle_unshuffle_4D(): + _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims=4) - def test_pixel_shuffle_5D_with_invalid_channels_dim(): - _test_pixel_shuffle_helper(num_input_dims=5, valid_channels_dim=False) + def test_pixel_shuffle_unshuffle_5D(): + _test_pixel_shuffle_unshuffle_for_input_dims(num_input_dims=5) - test_pixel_shuffle_1D() - test_pixel_shuffle_2D() - test_pixel_shuffle_3D_with_valid_channels_dim() - test_pixel_shuffle_4D_with_valid_channels_dim() - test_pixel_shuffle_5D_with_valid_channels_dim() - test_pixel_shuffle_3D_with_invalid_channels_dim() - test_pixel_shuffle_4D_with_invalid_channels_dim() - test_pixel_shuffle_5D_with_invalid_channels_dim() + test_pixel_shuffle_unshuffle_1D() + test_pixel_shuffle_unshuffle_2D() + test_pixel_shuffle_unshuffle_3D() + test_pixel_shuffle_unshuffle_4D() + test_pixel_shuffle_unshuffle_5D() def test_elu_inplace_view(self): v = 
torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index d2073bec9a27..7ad514d5d067 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -210,6 +210,7 @@ def gen_nn_functional(out: str) -> None: 'celu_', 'rrelu_', 'pixel_shuffle', + 'pixel_unshuffle', 'channel_shuffle', 'pdist', 'cosine_similarity', diff --git a/torch/csrc/api/include/torch/nn/functional/pixelshuffle.h b/torch/csrc/api/include/torch/nn/functional/pixelshuffle.h index 7ea98bf07d99..32161d04d806 100644 --- a/torch/csrc/api/include/torch/nn/functional/pixelshuffle.h +++ b/torch/csrc/api/include/torch/nn/functional/pixelshuffle.h @@ -16,6 +16,10 @@ inline Tensor pixel_shuffle( upscale_factor ); } + +inline Tensor pixel_unshuffle(const Tensor& input, int64_t downscale_factor) { + return torch::pixel_unshuffle(input, downscale_factor); +} } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -36,6 +40,12 @@ inline Tensor pixel_shuffle( return detail::pixel_shuffle(input, options.upscale_factor()); } +inline Tensor pixel_unshuffle( + const Tensor& input, + const PixelUnshuffleFuncOptions& options) { + return detail::pixel_unshuffle(input, options.downscale_factor()); +} + } // namespace functional } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h b/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h index 98d4be45e04a..08278ea2162e 100644 --- a/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h +++ b/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h @@ -12,12 +12,13 @@ namespace nn { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PixelShuffle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` -/// to a tensor of shape :math:`(*, C, H \times r, W \times r)`. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.PixelShuffle to learn -/// about the exact behavior of this module. +/// to a tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is an +/// upscale factor. See +/// https://pytorch.org/docs/master/nn.html#torch.nn.PixelShuffle to learn about +/// the exact behavior of this module. /// -/// See the documentation for `torch::nn::PixelShuffleOptions` class to learn what -/// constructor arguments are supported for this module. +/// See the documentation for `torch::nn::PixelShuffleOptions` class to learn +/// what constructor arguments are supported for this module. /// /// Example: /// ``` @@ -44,5 +45,42 @@ struct TORCH_API PixelShuffleImpl : public torch::nn::Cloneable { + explicit PixelUnshuffleImpl(const PixelUnshuffleOptions& options_); + + /// Pretty prints the `PixelUnshuffle` module into the given `stream`. + void pretty_print(std::ostream& stream) const override; + + Tensor forward(const Tensor& input); + + void reset() override; + + /// The options with which this `Module` was constructed. + PixelUnshuffleOptions options; +}; + +/// A `ModuleHolder` subclass for `PixelUnshuffleImpl`. +/// See the documentation for `PixelUnshuffleImpl` class to learn what methods +/// it provides, and examples of how to use `PixelUnshuffle` with +/// `torch::nn::PixelUnshuffleOptions`. See the documentation for `ModuleHolder` +/// to learn about PyTorch's module storage semantics. 
+TORCH_MODULE(PixelUnshuffle); + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/options/pixelshuffle.h b/torch/csrc/api/include/torch/nn/options/pixelshuffle.h index e72e6931e49a..e28e0053e98b 100644 --- a/torch/csrc/api/include/torch/nn/options/pixelshuffle.h +++ b/torch/csrc/api/include/torch/nn/options/pixelshuffle.h @@ -21,6 +21,20 @@ struct TORCH_API PixelShuffleOptions { TORCH_ARG(int64_t, upscale_factor); }; +/// Options for the `PixelUnshuffle` module. +/// +/// Example: +/// ``` +/// PixelUnshuffle model(PixelUnshuffleOptions(5)); +/// ``` +struct TORCH_API PixelUnshuffleOptions { + /* implicit */ PixelUnshuffleOptions(int64_t downscale_factor) + : downscale_factor_(downscale_factor) {} + + /// Factor to decrease spatial resolution by + TORCH_ARG(int64_t, downscale_factor); +}; + namespace functional { /// Options for `torch::nn::functional::pixel_shuffle`. /// @@ -33,6 +47,18 @@ namespace functional { /// F::pixel_shuffle(x, F::PixelShuffleFuncOptions(2)); /// ``` using PixelShuffleFuncOptions = PixelShuffleOptions; + +/// Options for `torch::nn::functional::pixel_unshuffle`. +/// +/// See the documentation for `torch::nn::PixelUnshuffleOptions` class to learn +/// what arguments are supported. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::pixel_unshuffle(x, F::PixelUnshuffleFuncOptions(2)); +/// ``` +using PixelUnshuffleFuncOptions = PixelUnshuffleOptions; } // namespace functional } // namespace nn diff --git a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp index dd2d34655979..7062b07fe5d7 100644 --- a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp +++ b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp @@ -21,5 +21,19 @@ Tensor PixelShuffleImpl::forward( return F::detail::pixel_shuffle(input, options.upscale_factor()); } +PixelUnshuffleImpl::PixelUnshuffleImpl(const PixelUnshuffleOptions& options_) + : options(options_) {} + +void PixelUnshuffleImpl::pretty_print(std::ostream& stream) const { + stream << "torch::nn::PixelUnshuffle(downscale_factor=" + << options.downscale_factor() << ")"; +} + +void PixelUnshuffleImpl::reset() {} + +Tensor PixelUnshuffleImpl::forward(const Tensor& input) { + return F::detail::pixel_unshuffle(input, options.downscale_factor()); +} + } // namespace nn } // namespace torch diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 24bfecb49ed5..2563d4b0ba29 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -2799,7 +2799,7 @@ def multi_margin_loss(input, target, p=1, margin=1., weight=None, size_average=N pixel_shuffle(input, upscale_factor) -> Tensor Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` to a -tensor of shape :math:`(*, C, H \times r, W \times r)`. +tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is the :attr:`upscale_factor`. See :class:`~torch.nn.PixelShuffle` for details. @@ -2815,6 +2815,27 @@ def multi_margin_loss(input, target, p=1, margin=1., weight=None, size_average=N torch.Size([1, 1, 12, 12]) """) +pixel_unshuffle = _add_docstr(torch.pixel_unshuffle, r""" +pixel_unshuffle(input, downscale_factor) -> Tensor + +Reverses the :class:`~torch.nn.PixelShuffle` operation by rearranging elements in a +tensor of shape :math:`(*, C, H \times r, W \times r)` to a tensor of shape +:math:`(*, C \times r^2, H, W)`, where r is the :attr:`downscale_factor`. + +See :class:`~torch.nn.PixelUnshuffle` for details. 
+ +Args: + input (Tensor): the input tensor + downscale_factor (int): factor to increase spatial resolution by + +Examples:: + + >>> input = torch.randn(1, 1, 12, 12) + >>> output = torch.nn.functional.pixel_unshuffle(input, 3) + >>> print(output.size()) + torch.Size([1, 9, 4, 4]) +""") + channel_shuffle = _add_docstr(torch.channel_shuffle, r""" channel_shuffle(input, groups) -> Tensor diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 30b0d61b42d2..4911d4bef38f 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -24,7 +24,7 @@ from .sparse import Embedding, EmbeddingBag from .rnn import RNNBase, RNN, LSTM, GRU, \ RNNCellBase, RNNCell, LSTMCell, GRUCell -from .pixelshuffle import PixelShuffle +from .pixelshuffle import PixelShuffle, PixelUnshuffle from .upsampling import UpsamplingNearest2d, UpsamplingBilinear2d, Upsample from .distance import PairwiseDistance, CosineSimilarity from .fold import Fold, Unfold @@ -50,7 +50,7 @@ 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', - 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'PixelUnshuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py index 8256b111b988..d17f5616c2e9 100644 --- a/torch/nn/modules/pixelshuffle.py +++ b/torch/nn/modules/pixelshuffle.py @@ -6,26 +6,30 @@ class PixelShuffle(Module): r"""Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` - to a tensor of shape :math:`(*, C, H \times r, W \times r)`. + to a tensor of shape :math:`(*, C, H \times r, W \times r)`, where r is an upscale factor. This is useful for implementing efficient sub-pixel convolution with a stride of :math:`1/r`. - Look at the paper: + See the paper: `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ by Shi et. al (2016) for more details. - Note that this function can take inputs with any number of batch dimensions: - :math:`(L, H_{in}, W_{in})`, :math:`(N, L, H_{in}, W_{in})`, :math:`(N_1, N_2, L, H_{in}, W_{in})`, etc. - Args: upscale_factor (int): factor to increase spatial resolution by Shape: - - Input: :math:`(*, L, H_{in}, W_{in})` where :math:`L=C \times \text{upscale\_factor}^2` - - Output: :math:`(*, C, H_{out}, W_{out})` where - :math:`H_{out} = H_{in} \times \text{upscale\_factor}` - and :math:`W_{out} = W_{in} \times \text{upscale\_factor}` + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \div \text{upscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \times \text{upscale\_factor} + + .. 
math:: + W_{out} = W_{in} \times \text{upscale\_factor} Examples:: @@ -50,3 +54,53 @@ def forward(self, input: Tensor) -> Tensor: def extra_repr(self) -> str: return 'upscale_factor={}'.format(self.upscale_factor) + + +class PixelUnshuffle(Module): + r"""Reverses the :class:`~torch.nn.PixelShuffle` operation by rearranging elements + in a tensor of shape :math:`(*, C, H \times r, W \times r)` to a tensor of shape + :math:`(*, C \times r^2, H, W)`, where r is a downscale factor. + + See the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ + by Shi et. al (2016) for more details. + + Args: + downscale_factor (int): factor to decrease spatial resolution by + + Shape: + - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions + - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where + + .. math:: + C_{out} = C_{in} \times \text{downscale\_factor}^2 + + .. math:: + H_{out} = H_{in} \div \text{downscale\_factor} + + .. math:: + W_{out} = W_{in} \div \text{downscale\_factor} + + Examples:: + + >>> pixel_unshuffle = nn.PixelUnshuffle(3) + >>> input = torch.randn(1, 1, 12, 12) + >>> output = pixel_unshuffle(input) + >>> print(output.size()) + torch.Size([1, 9, 4, 4]) + + .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network: + https://arxiv.org/abs/1609.05158 + """ + __constants__ = ['downscale_factor'] + downscale_factor: int + + def __init__(self, downscale_factor: int) -> None: + super(PixelUnshuffle, self).__init__() + self.downscale_factor = downscale_factor + + def forward(self, input: Tensor) -> Tensor: + return F.pixel_unshuffle(input, self.downscale_factor) + + def extra_repr(self) -> str: + return 'downscale_factor={}'.format(self.downscale_factor) diff --git a/torch/overrides.py b/torch/overrides.py index d23e34831bdd..6c193b273344 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -706,6 +706,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.pdist: lambda input, p=2: -1, torch.pinverse: lambda input, rcond=1e-15: -1, torch.pixel_shuffle: lambda input, upscale_factor: -1, + torch.pixel_unshuffle: lambda input, downscale_factor: -1, torch.poisson: lambda input, generator=None: -1, torch.poisson_nll_loss: lambda input, target, log_input, full, eps, reduction: -1, torch.polygamma: lambda input, n, out=None: -1, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 8b9a5072e50f..c588f69c2875 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2516,6 +2516,12 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::PixelShuffleOptions(3)', input_size=(1, 9, 4, 4), ), + dict( + module_name='PixelUnshuffle', + constructor_args=(3,), + cpp_constructor_args='torch::nn::PixelUnshuffleOptions(3)', + input_size=(1, 1, 12, 12), + ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), cpp_options_args='''F::InterpolateFuncOptions() diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index 4a91394d53c5..2acc380579e5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -140,6 +140,7 @@ ('multilabel_soft_margin_loss', (3, S), (non_differentiable(torch.rand(3, S)),),), ('cosine_embedding_loss', (S, S), ((S, S), 
non_differentiable(torch.rand(S,))),), ('pixel_shuffle', (1, 9, 4, 4), (3,),), + ('pixel_unshuffle', (1, 1, 12, 12), (3,),), ('affine_grid', (S, 2, 3), (torch.Size([S, 1, 7, 7]),),), ('pad', (3, 3, 4, 2), ([1, 1],),), ('pairwise_distance', (S, S), ((S, S),),), From af1b636b89a99a7b4520fa9c3edd4c8e30969c6f Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 22 Dec 2020 21:35:24 -0800 Subject: [PATCH 24/45] [Gradient Compression] Change wait() to value() in some callbacks of PowerSGD communication hook (#49709) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49709 Since wait() has already been called in the return statements of the precursor callbacks, no need to wait again. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 119015237 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_DistributedDataParallel_powerSGD_ddp_comm_hook Reviewed By: rohan-varma Differential Revision: D25672068 fbshipit-source-id: da136327db4c4c0e3b846ba8d6885629f1044374 --- torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 627efac00c4b..d3a402801f28 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -281,7 +281,7 @@ def unpack_rank1_tensors_and_allreduce_ps(fut): ] def compute_qs(fut): - state.p_memory_dict[bucket_index] = fut.wait()[0] + state.p_memory_dict[bucket_index] = fut.value()[0] for p in ps: _orthogonalize(p) @@ -299,7 +299,7 @@ def compute_qs(fut): ] def decompress(fut): - state.q_memory_dict[bucket_index] = fut.wait()[0].div_(world_size) + state.q_memory_dict[bucket_index] = fut.value()[0].div_(world_size) for p, q, tensor in zip(ps, qs, high_rank_tensors): torch.matmul(p, q.t(), out=tensor) From f474ffa1a90f1b5848390e9a96bb17d5b727189b Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 22 Dec 2020 21:56:48 -0800 Subject: [PATCH 25/45] [quant][graphmode][fx] Change standalone module api (#49719) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49719 We find there are multiple use cases for standalone module, one use case requires standalone module to produce a module that takes float Tensor as input and outputs a float Tensor, the other needs to produce a modulee that takes quantized Tensor as input and outputs a quantized Tensor. This is similar to `quantized_input_idxs` and `quantized_output_idxs` so we want to nest prepare_custom_config_dict in the standalone module configuration, for maximum flxibility we also include qconfig_dict for stand alone module as well in case user needs to have special qconfig_dict for the standalone module in the future. Changed from ```python prepare_custom_config_dict = { "standalone_module_name": ["standalone_module"], "standalone_module_class": [StandaloneModule] } ``` to ```python prepare_custom_config_dict = { "standalone_module_name": [("standalone_module", qconfig_dict1, prepare_custom_config_dict1)], "standalone_module_class": [(StandaloneModule, qconfig_dict2, prepare_custom_config_dict2)] } ``` The entries in the config are: 1. name/module_class 2. 
optional qconfig_dict, when it is None, we'll use {"": qconfig} where qconfig is the one from parent qconfig_dict 3. optional prepare_custom_config_dict, when it is None, we'll use default value of prepare_custom_config_dict for prepare API (None) Test Plan: python test/test_quantization.py TestQuantizeFx.test_standalone_module Imported from OSS Reviewed By: raghuramank100 Differential Revision: D25675704 fbshipit-source-id: 0889f519a3e55a7a677f0e2db4db9a18d87a93d4 --- test/quantization/test_quantize_fx.py | 4 +-- torch/quantization/fx/quantize.py | 36 +++++++++++++++++---------- torch/quantization/quantize_fx.py | 20 ++++++++++----- 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 7b7b5ffb83a0..66324f928f04 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -611,8 +611,8 @@ def forward(self, x): original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": ["standalone"]} - config_class = {"standalone_module_class": [StandaloneModule]} + config_name = {"standalone_module_name": [("standalone", None, None)]} + config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} for prepare_config in [config_name, config_class]: original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index c57b2c02aa86..af9496a66a63 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -124,11 +124,18 @@ def insert_observer_for_special_module( elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): # observe standalone module standalone_module = modules[node.target] # type: ignore + standalone_module_name_configs = prepare_custom_config_dict.get("standalone_module_name", []) + standalone_module_class_configs = prepare_custom_config_dict.get("standalone_module_class", []) + class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} + name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} + config = class_config_map.get(type(standalone_module), (None, None)) + config = name_config_map.get(node.target, (None, None)) + standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + standalone_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, {"": qconfig}) - observed_standalone_module.qconfig = qconfig + prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) @@ -395,10 +402,13 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, self._generate_qconfig_map(model, model.graph, qconfig_dict) # match the patterns that will get quantized - standalone_module_names = prepare_custom_config_dict.get( - "standalone_module_name", None) - standalone_module_classes = prepare_custom_config_dict.get( - "standalone_module_class", None) + standalone_module_name_configs = prepare_custom_config_dict.get( + "standalone_module_name", []) + standalone_module_class_configs = prepare_custom_config_dict.get( + 
"standalone_module_class", []) + + standalone_module_names = [config[0] for config in standalone_module_name_configs] + standalone_module_classes = [config[0] for config in standalone_module_class_configs] custom_module_classes = get_custom_module_class_keys( prepare_custom_config_dict, "float_to_observed_custom_module_class") assert self.patterns is not None @@ -754,21 +764,21 @@ def insert_quantize_node(node: Node) -> None: root_node, matched, matched_pattern, obj, qconfig = \ matches.get(node.name, (None, None, None, None, None)) if root_node is node: - if qconfig is None: + is_observed_standalone_module_node = ( + node.op == 'call_module' and + is_observed_standalone_module( + self.modules[node.target]) # type: ignore + ) + if qconfig is None and not is_observed_standalone_module_node: result = self.quantized_graph.node_copy( node, load_non_quantized) quantized = False else: assert obj is not None - is_standalone_module_node = ( - node.op == 'call_module' and - is_observed_standalone_module( - self.modules[node.target]) # type: ignore - ) result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if is_standalone_module_node: + if is_observed_standalone_module_node: quantized = False else: quantized = is_output_quantized(node, obj) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 1bf91125feb2..cba104b8f783 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -81,11 +81,11 @@ def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any, # symbolically trace the model if not is_standalone_module: # standalone module and custom module config are applied in top level module - standalone_module_names = prepare_custom_config_dict.get('standalone_module_name', []) - skipped_module_names += standalone_module_names + standalone_module_name_configs = prepare_custom_config_dict.get("standalone_module_name", []) + skipped_module_names += [config[0] for config in standalone_module_name_configs] - standalone_module_classes = prepare_custom_config_dict.get('standalone_module_class', []) - skipped_module_classes += standalone_module_classes + standalone_module_class_configs = prepare_custom_config_dict.get("standalone_module_class", []) + skipped_module_classes += [config[0] for config in standalone_module_class_configs] float_custom_module_classes = get_custom_module_class_keys( prepare_custom_config_dict, "float_to_observed_custom_module_class") skipped_module_classes += float_custom_module_classes @@ -178,11 +178,19 @@ def prepare_fx( # optional: specify the path for standalone modules # These modules are symbolically traced and quantized as one unit "standalone_module_name": [ - "submodule.standalone" + # module_name, qconfig_dict, prepare_custom_config_dict + ("submodule.standalone", + None, # qconfig_dict for the prepare function called in the submodule, + # None means use qconfig from parent qconfig_dict + {"input_quantized_idxs": [], "output_quantized_idxs": []}) # prepare_custom_config_dict ], "standalone_module_class": [ - StandaloneModule + # module_class, qconfig_dict, prepare_custom_config_dict + (StandaloneModule, + None, # qconfig_dict for the prepare function called in the submodule, + # None means use qconfig from parent qconfig_dict + {"input_quantized_idxs": [0], "output_quantized_idxs": [0]}) # prepare_custom_config_dict ], # user will manually define the corresponding observed From ee271047b5bcef5993db3582c759aeec966b98e8 Mon Sep 17 00:00:00 2001 From: 
Michael Carilli Date: Tue, 22 Dec 2020 22:22:11 -0800 Subject: [PATCH 26/45] torch.utils.checkpoint.checkpoint + torch.cuda.amp (#49757) Summary: Adds a test to orphaned original PR (https://github.com/pytorch/pytorch/pull/40221). Should fix https://github.com/pytorch/pytorch/issues/49738 and https://github.com/pytorch/pytorch/issues/47183 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49757 Reviewed By: mruberry Differential Revision: D25689609 Pulled By: ngimel fbshipit-source-id: 0a6adc11eb98382048ef9a9775e185dcdeff6010 --- test/test_cuda.py | 12 ++++++++++++ torch/utils/checkpoint.py | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index c7466cc2b574..cef6d689343a 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -16,6 +16,7 @@ import torch.cuda.comm as comm from torch import multiprocessing as mp from torch.nn.parallel import scatter_gather +from torch.utils.checkpoint import checkpoint_sequential from torch._six import inf, nan, container_abcs from test_torch import AbstractTestCases @@ -2882,6 +2883,17 @@ def test_autocast_cache_leak(self): out = linear(data) self.assertTrue(first_iter_mem == torch.cuda.memory_allocated()) + def test_autocast_checkpointing(self): + model = torch.nn.Sequential(torch.nn.Linear(8, 8), + torch.nn.Linear(8, 8), + torch.nn.Linear(8, 8)).cuda() + input = torch.rand((8, 8), device="cuda", dtype=torch.float16, requires_grad=True) + with torch.cuda.amp.autocast(): + output = checkpoint_sequential(model, 2, input) + self.assertTrue(output.requires_grad) + self.assertTrue(output.dtype is torch.float16) + output.sum().backward() + @slowTest @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") def test_max_large_axis(self): diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py index 4d183d686fae..a31a15907a33 100644 --- a/torch/utils/checkpoint.py +++ b/torch/utils/checkpoint.py @@ -59,6 +59,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): check_backward_validity(args) ctx.run_function = run_function ctx.preserve_rng_state = preserve_rng_state + ctx.had_autocast_in_fwd = torch.is_autocast_enabled() if preserve_rng_state: ctx.fwd_cpu_state = torch.get_rng_state() # Don't eagerly initialize the cuda context by accident. @@ -91,7 +92,7 @@ def backward(ctx, *args): if ctx.had_cuda_in_fwd: set_device_states(ctx.fwd_gpu_devices, ctx.fwd_gpu_states) detached_inputs = detach_variable(inputs) - with torch.enable_grad(): + with torch.enable_grad(), torch.cuda.amp.autocast(ctx.had_autocast_in_fwd): outputs = ctx.run_function(*detached_inputs) if isinstance(outputs, torch.Tensor): From 88c33ff8abf7038112d5985a8900d91c4c441a15 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 22 Dec 2020 23:20:04 -0800 Subject: [PATCH 27/45] [Gradient Compression] Explicitly restrict the scope of torch.cuda.synchronize to the current device (#49711) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49711 `torch.cuda.synchronize` uses the current device by default. Explicitly specify this device for better readability. 
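For illustration only (not part of this patch), a minimal sketch of the two call forms; the `device` variable here is hypothetical and stands in for whichever device the compression hook is running on:

```python
import torch

device = torch.device("cuda:0")  # hypothetical device used by the gradient compression hook

# Implicit form: waits on whatever device is currently selected,
# which the reader has to infer from surrounding context.
torch.cuda.synchronize()

# Explicit form (what this patch switches to): names the device being
# synchronized, so the intent is clear at the call site.
torch.cuda.synchronize(device)
```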
Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 119017654 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_DistributedDataParallel_powerSGD_ddp_comm_hook Reviewed By: rohan-varma Differential Revision: D25672267 fbshipit-source-id: 62a2266727a2ea76175f3c438daf20951091c771 --- torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index d3a402801f28..e1322ba489ed 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -304,7 +304,7 @@ def decompress(fut): for p, q, tensor in zip(ps, qs, high_rank_tensors): torch.matmul(p, q.t(), out=tensor) if torch.cuda.is_available(): - torch.cuda.synchronize() + torch.cuda.synchronize(device) if state.use_error_feedback: # Memorize the local errors. @@ -494,7 +494,7 @@ def decompress(fut): # Memorize the local errors. state.error_dict[bucket_index] = input_tensor_cp - input_tensor if torch.cuda.is_available(): - torch.cuda.synchronize() + torch.cuda.synchronize(device) if not state.warm_start: state.p_memory_dict.clear() state.q_memory_dict.clear() From 55b431b17aba504ae7b75f6f97b4437101e50f38 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 22 Dec 2020 23:20:04 -0800 Subject: [PATCH 28/45] [Gradient Compression] Directly let world_size = group_to_use.size() (#49715) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49715 Address the comment on https://github.com/pytorch/pytorch/pull/49417#discussion_r545388351 ghstack-source-id: 119049598 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D25673997 fbshipit-source-id: 44eb2540e5a77331c34ba503285cbd0bd63c2c0a --- .../algorithms/ddp_comm_hooks/default_hooks.py | 12 +++--------- .../algorithms/ddp_comm_hooks/powerSGD_hook.py | 8 ++------ .../algorithms/ddp_comm_hooks/quantization_hooks.py | 8 ++------ 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py index 4e6cbd72aee6..59491a868be4 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py @@ -18,9 +18,7 @@ def allreduce_hook( >>> ddp_model.register_comm_hook(process_group, allreduce_hook) """ group_to_use = process_group if process_group is not None else dist.group.WORLD - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() tensor = bucket.get_tensors()[0] fut = dist.all_reduce(tensor, group=group_to_use, async_op=True).get_future() @@ -46,9 +44,7 @@ def fp16_compress_hook( >>> ddp_model.register_comm_hook(process_group, fp16_compress_hook) """ group_to_use = process_group if process_group is not None else dist.group.WORLD - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() compressed_tensor = bucket.get_tensors()[0].to(torch.float16) @@ -100,9 +96,7 @@ def _allgather_then_aggregate_hook( """ group_to_use = process_group if process_group is not None 
else dist.group.WORLD rank = process_group.rank() if process_group is not None else dist.get_rank() - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() tensor = bucket.get_tensors()[0] fut = dist.all_gather( diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index e1322ba489ed..590ce2054c03 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -126,9 +126,7 @@ def powerSGD_hook( """ process_group = state.process_group group_to_use = process_group if process_group is not None else dist.group.WORLD - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() # The input tensor is a flattened 1D tensor. input_tensor = bucket.get_tensors()[0] @@ -363,9 +361,7 @@ def batched_powerSGD_hook( """ process_group = state.process_group group_to_use = process_group if process_group is not None else dist.group.WORLD - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() # The input tensor is a flattened 1D tensor. input_tensor = bucket.get_tensors()[0] diff --git a/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py index cde3b79fc7ce..87ee4145bdee 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py @@ -63,9 +63,7 @@ def quantization_pertensor_hook( """ group_to_use = process_group if process_group is not None else dist.group.WORLD rank = process_group.rank() if process_group is not None else dist.get_rank() - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() tensor = bucket.get_tensors()[0] @@ -144,9 +142,7 @@ def quantization_perchannel_hook( """ group_to_use = process_group if process_group is not None else dist.group.WORLD rank = process_group.rank() if process_group is not None else dist.get_rank() - world_size = ( - process_group.size() if process_group is not None else dist.get_world_size() - ) + world_size = group_to_use.size() tensor = bucket.get_tensors()[0] From 5171bd94d7b39ad683e1b6b4f883e85a919175a7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 23 Dec 2020 09:15:44 -0800 Subject: [PATCH 29/45] [lint doc] how to fix flake errors if pre-commit hook wasn't there (#49345) Summary: This PR adds instructions on what to do if one committed into a PR branch w/o having a pre-commit hook enabled and having CI report flake8 errors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49345 Reviewed By: cpuhrsch Differential Revision: D25683167 Pulled By: soumith fbshipit-source-id: 3c45c866e1636c116d2cacec438d62c860e6b854 --- CONTRIBUTING.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0933180373e6..c55240d7011a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -903,6 +903,16 @@ You'll need to install an appropriately configured flake8; see [Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) for documentation on how to do this. 
+If you haven't set up the pre-commit hook and have already committed files and +CI reports `flake8` errors, you can run the check locally in your PR branch with: + + ```bash + flake8 $(git diff --name-only $(git merge-base --fork-point master)) + ``` + +fix the code so that no errors are reported when you re-run the above check again, +and then commit the fix. + ## Building PyTorch with ASAN [ASAN](https://github.com/google/sanitizers/wiki/AddressSanitizer) is very From 370350c749c2c6b09f783d74a8b3e4c233c8b135 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Wed, 23 Dec 2020 10:56:20 -0800 Subject: [PATCH 30/45] Preserve memory format in qconv op (#49533) Summary: * qconv used to return NHWC no matter the input format * this change returns NCHW format if the input was NCHW Pull Request resolved: https://github.com/pytorch/pytorch/pull/49533 Test Plan: pytest test/quantization/test_quantized_op.py::\ TestQuantizedConv::test_qconv2d_preserve_mem_format Fixes https://github.com/pytorch/pytorch/issues/47295 Reviewed By: kimishpatel Differential Revision: D25609205 Pulled By: axitkhurana fbshipit-source-id: 83f8ca4a1496a8a4612fc3da082d727ead257ce7 --- aten/src/ATen/native/quantized/cpu/qconv.cpp | 2 +- test/quantization/test_quantized_op.py | 32 ++++++++++++++------ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 05762bfb036f..b7d893ad55fc 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -746,7 +746,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( run_status == pytorch_qnnp_status_success, "failed to run quantized::conv2d (qnnpack) operator"); - return output; + return output.contiguous(act.suggest_memory_format()); } template at::Tensor PackedConvWeightsQnnp<2>::apply( diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 646c5740589d..c676ccc0f793 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -3346,7 +3346,7 @@ def _make_qconv_tensors( self, batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, - use_bias, use_channelwise, use_transpose + use_bias, use_channelwise, use_transpose, memory_format=torch.contiguous_format ): assert not (use_channelwise and use_transpose), \ "Cannot generate channelwise qconv_transpose_tensors " @@ -3394,6 +3394,7 @@ def _make_qconv_tensors( (batch_size, input_channels,) + input_feature_map_shape, ) X = X_scale * (X_init - X_zero_point).float() + X = X.to(memory_format=memory_format) if use_channelwise: W_shape = (-1, 1) + (1,) * len(kernels) @@ -3426,13 +3427,15 @@ def _test_qconv_impl( input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, o_pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, - Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose + Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose, + memory_format=torch.contiguous_format ): (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, - W_zero_point, use_bias, use_channelwise, use_transpose) + W_zero_point, use_bias, use_channelwise, 
use_transpose, + memory_format) # Assign weights W = W_q.dequantize() X = X_q.dequantize() @@ -3480,6 +3483,14 @@ def _test_qconv_impl( pads: {pads}, o_pads: {o_pads}, dilations: {dilations}, groups: {groups}, y_s: {Y_scale}, y_zp: {Y_zero_point}''') + # fbgemm for now forces output to be NHWC (channels last) to opportunistically + # improve performance + if torch.backends.quantized.engine == 'qnnpack': + # Make sure memory format is preserved + self.assertEqual( + X_q.is_contiguous(memory_format=memory_format), + Y_q.is_contiguous(memory_format=memory_format)) + # Return the quantized data for later reuse return X_q, W_q, bias_float @@ -3552,12 +3563,14 @@ def test_qconv2d( dilations, groups, ) - self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) + for memory_format in (torch.contiguous_format, torch.channels_last): + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, + memory_format) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4150,6 +4163,7 @@ def test_qconv3d_unpack( (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad), channelwise) + class TestPadding(TestCase): @given(batch_size=st.integers(1, 64), channels=st.integers(1, 64), From 8554b58fbdd865c760d92bfa50c1119cc8fc65e9 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 23 Dec 2020 11:27:13 -0800 Subject: [PATCH 31/45] Added linalg.inv (#48261) Summary: This PR adds `torch.linalg.inv` for NumPy compatibility. `linalg_inv_out` uses in-place operations on provided `result` tensor. I modified `apply_inverse` to accept tensor of Int instead of std::vector, that way we can write a function similar to `linalg_inv_out` but removing the error checks and device memory synchronization. I fixed `lda` (leading dimension parameter which is max(1, n)) in many places to handle 0x0 matrices correctly. Zero batch dimensions are also working and tested. 
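For illustration, a minimal user-level sketch of the new function (not part of this patch; it assumes a build that
includes these changes, and the `+ 4 * torch.eye(4)` shift is only there to keep the random batch well conditioned):

```python
import torch

# 2 x 3 batch of 4x4 double-precision matrices, shifted on the diagonal so they are well conditioned.
a = torch.randn(2, 3, 4, 4, dtype=torch.float64) + 4 * torch.eye(4, dtype=torch.float64)

a_inv = torch.linalg.inv(a)  # new NumPy-compatible entry point
residual = torch.matmul(a, a_inv) - torch.eye(4, dtype=torch.float64).expand_as(a)
print(residual.abs().max())  # typically around 1e-15 for float64

# The out= variant resizes an empty result tensor and fills it in-place.
out = torch.empty(0, dtype=a.dtype)
torch.linalg.inv(a, out=out)
```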
Ref https://github.com/pytorch/pytorch/issues/42666 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48261 Reviewed By: ngimel Differential Revision: D25690129 Pulled By: mruberry fbshipit-source-id: edb2d03721f22168c42ded8458513cb23dfdc712 --- aten/src/ATen/cuda/CUDABlas.cpp | 22 ++-- aten/src/ATen/cuda/CUDABlas.h | 2 +- aten/src/ATen/native/BatchLinearAlgebra.cpp | 110 ++++++++++++++--- .../ATen/native/cuda/BatchLinearAlgebra.cu | 109 ++++++++++++----- .../ATen/native/cuda/BatchLinearAlgebraLib.cu | 85 +++++++++---- .../ATen/native/cuda/BatchLinearAlgebraLib.h | 1 + aten/src/ATen/native/cuda/LinearAlgebra.cu | 4 +- aten/src/ATen/native/native_functions.yaml | 21 ++++ docs/source/linalg.rst | 1 + test/test_linalg.py | 113 +++++++++++++----- tools/autograd/derivatives.yaml | 3 + tools/autograd/gen_variable_type.py | 2 +- torch/csrc/api/include/torch/linalg.h | 19 +++ torch/linalg/__init__.py | 59 +++++++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 33 +++++ 16 files changed, 464 insertions(+), 121 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 8c32c8db1a1c..0521adf669c5 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -831,18 +831,18 @@ void getrfBatched>( template <> void getriBatched( - int n, double** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize, double** dC_array) { + int n, double** dA_array, int ldda, int* ipiv_array, double** dC_array, int lddc, int* info_array, int batchsize) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasDgetriBatched( - handle, n, dA_array, ldda, ipiv_array, dC_array, n, info_array, batchsize)); + handle, n, dA_array, ldda, ipiv_array, dC_array, lddc, info_array, batchsize)); } template <> void getriBatched( - int n, float** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize, float** dC_array) { + int n, float** dA_array, int ldda, int* ipiv_array, float** dC_array, int lddc, int* info_array, int batchsize) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasSgetriBatched( - handle, n, dA_array, ldda, ipiv_array, dC_array, n, info_array, batchsize)); + handle, n, dA_array, ldda, ipiv_array, dC_array, lddc, info_array, batchsize)); } template <> @@ -851,9 +851,10 @@ void getriBatched>( c10::complex** dA_array, int ldda, int* ipiv_array, + c10::complex** dC_array, + int lddc, int* info_array, - int batchsize, - c10::complex** dC_array) { + int batchsize) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasZgetriBatched( handle, @@ -862,7 +863,7 @@ void getriBatched>( ldda, ipiv_array, reinterpret_cast(dC_array), - n, + lddc, info_array, batchsize)); } @@ -873,9 +874,10 @@ void getriBatched>( c10::complex** dA_array, int ldda, int* ipiv_array, + c10::complex** dC_array, + int lddc, int* info_array, - int batchsize, - c10::complex** dC_array) { + int batchsize) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasCgetriBatched( handle, @@ -884,7 +886,7 @@ void getriBatched>( ldda, ipiv_array, reinterpret_cast(dC_array), - n, + lddc, info_array, batchsize)); } diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 93a0ff588dda..d44fc49c589a 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -175,7 +175,7 @@ void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex void getriBatched(CUDABLAS_GETRI_ARGTYPES(Dtype)) { diff --git 
a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index cc1403febf90..7bbdd8072a11 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -528,8 +528,15 @@ Tensor linalg_solve(const Tensor& input, const Tensor& other) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/* +Computes the inverse of n-by-n matrix 'self' +This is an in-place routine, content of 'self' is overriden. +'infos_lu' and 'infos_getri' are int Tensors containing error codes for each matrix in the batched input. +'infos_lu' is for holding lapackLU errors, and 'infos_getri' is for holding lapackGetri errors. +For more information see LAPACK's documentation for GETRI and GETRF routines. +*/ template -static void apply_inverse(Tensor& self, std::vector& infos) { +static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { #ifndef USE_LAPACK AT_ERROR("inverse: LAPACK library not found in compilation"); #else @@ -538,9 +545,12 @@ static void apply_inverse(Tensor& self, std::vector& infos) { auto self_matrix_stride = matrixStride(self); auto batch_size = batchCount(self); auto n = self.size(-2); + auto lda = std::max(1, n); - auto ipiv = at::empty({n}, self.options().dtype(kInt)); + auto ipiv = at::empty({lda}, self.options().dtype(kInt)); auto ipiv_data = ipiv.data_ptr(); + auto infos_lu_data = infos_lu.data_ptr(); + auto infos_getri_data = infos_getri.data_ptr(); int info; // Run once, first to get the optimum work size @@ -549,39 +559,36 @@ static void apply_inverse(Tensor& self, std::vector& infos) { // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackGetri(n, self_data, n, ipiv_data, &wkopt, lwork, &info); + lapackGetri(n, self_data, lda, ipiv_data, &wkopt, lwork, &info); lwork = static_cast(real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); auto work_data = work.data_ptr(); for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - lapackLu(n, n, self_working_ptr, n, ipiv_data, &info); - infos[i] = info; - if (info != 0) { - return; - } + int* info_lu_working_ptr = &infos_lu_data[i]; + lapackLu(n, n, self_working_ptr, lda, ipiv_data, info_lu_working_ptr); // now compute the actual inverse - lapackGetri(n, self_working_ptr, n, ipiv_data, work_data, lwork, &info); - infos[i] = info; - if (info != 0) { - return; - } + int* info_getri_working_ptr = &infos_getri_data[i]; + lapackGetri(n, self_working_ptr, lda, ipiv_data, work_data, lwork, info_getri_working_ptr); } #endif } Tensor _inverse_helper_cpu(const Tensor& self) { - std::vector infos(batchCount(self), 0); + auto infos_lu = at::empty({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + auto infos_getri = at::empty({std::max(1, batchCount(self))}, self.options().dtype(kInt)); auto self_working_copy = cloneBatchedColumnMajor(self); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cpu", [&]{ - apply_inverse(self_working_copy, infos); + apply_inverse(self_working_copy, infos_lu, infos_getri); }); if (self.dim() > 2) { - batchCheckErrors(infos, "inverse_cpu"); + batchCheckErrors(infos_lu, "inverse_cpu"); + batchCheckErrors(infos_getri, "inverse_cpu"); } else { - singleCheckErrors(infos[0], "inverse_cpu"); + singleCheckErrors(infos_lu.item().toInt(), "inverse_cpu"); + singleCheckErrors(infos_getri.item().toInt(), "inverse_cpu"); } return self_working_copy; 
} @@ -602,6 +609,75 @@ Tensor& inverse_out(Tensor &result, const Tensor &self) { return result; } +// This is a type dispatching helper function for 'apply_inverse' +Tensor& _linalg_inv_out_helper_cpu(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) { + // This function calculates the inverse matrix in-place + // result should be in column major order and contain matrices to invert + // the content of result is overriden by 'apply_inverse' + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cpu", [&]{ + apply_inverse(result, infos_lu, infos_getri); + }); + return result; +} + +// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +// LAPACK/MAGMA/cuSOLVER error codes are saved in 'infos' tensors, they are not checked here +static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& infos_getri, const Tensor& input) { + squareCheckInputs(input); + TORCH_INTERNAL_ASSERT(infos_lu.scalar_type() == kInt); + TORCH_INTERNAL_ASSERT(infos_getri.scalar_type() == kInt); + TORCH_CHECK(result.scalar_type() == input.scalar_type(), + "result dtype ", result.scalar_type(), " does not match input dtype ", input.scalar_type()); + TORCH_CHECK(result.device() == input.device(), + "result device ", result.device(), " does not match input device ", input.device()); + + // if result has no elements we can modify it + if (result.numel() == 0) { + at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous); + result.transpose_(-2, -1); + } else { + // Resize messes up the strides and we expect strictly column major order, so let's not use at::native::resize_output + TORCH_CHECK(result.sizes().equals(input.sizes()), + "result shape ", result.sizes(), " does not match input shape ", input.sizes()); + } + + TORCH_CHECK(result.transpose(-2, -1).is_contiguous(), "result tensor must be in batched column major order (Fortran contiguous)."); + result.copy_(input); + + at::native::resize_output(infos_lu, {std::max(1, batchCount(input))}); + at::native::resize_output(infos_getri, {std::max(1, batchCount(input))}); + infos_lu.fill_(0); + infos_getri.fill_(0); + + result = at::_linalg_inv_out_helper_(result, infos_lu, infos_getri); + return result; +} + +// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +Tensor& linalg_inv_out(Tensor &result, const Tensor &input) { + auto infos_lu = at::empty({0}, input.options().dtype(kInt)); + auto infos_getri = at::empty({0}, input.options().dtype(kInt)); + result = linalg_inv_out_info(result, infos_lu, infos_getri, input); + + // Now check LAPACK/MAGMA/cuSOLVER error codes + if (result.dim() > 2) { + batchCheckErrors(infos_lu, "linalg_inv_lu"); + batchCheckErrors(infos_getri, "linalg_inv_getri"); + } else { + singleCheckErrors(infos_lu.item().toInt(), "linalg_inv_lu"); + singleCheckErrors(infos_getri.item().toInt(), "linalg_inv_getri"); + } + + return result; +} + +// Computes the inverse matrix of 'input' +Tensor linalg_inv(const Tensor &input) { + Tensor result = at::empty({0}, input.options()); + result = at::linalg_inv_out(result, input); + return result; +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index e5804ba389c5..252ab57048be 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1227,8 +1227,14 @@ Tensor& 
_linalg_solve_out_helper_cuda(Tensor& result, Tensor& input, Tensor& inf // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/* +Computes the inverse of n-by-n matrix 'self', it is saved to 'self_inv'. +'infos' is an int Tensor containing error codes for each matrix in the batched input. +'infos_lu' is for holding magmaLU errors, and 'infos_getri' is for holding magmaGetri errors +For more information see MAGMA's documentation for GETRI and GETRF routines. +*/ template -static void apply_batched_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { +static void apply_batched_inverse(Tensor& self, Tensor& self_inv, Tensor& infos_lu, Tensor& infos_getri) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. Please rebuild with MAGMA."); @@ -1238,20 +1244,24 @@ AT_ERROR("inverse: MAGMA library not found in " auto self_inv_data = self_inv.data_ptr(); auto self_inv_mat_stride = matrixStride(self_inv); + auto infos_lu_data = infos_lu.data_ptr(); + auto infos_getri_data = infos_getri.data_ptr(); + magma_int_t batch_size = magma_int_cast(batchCount(self), "batchCount"); magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); + magma_int_t lda = std::max(1, n); - magma_int_t* info_array; magma_int_t* ipiv_data; magma_int_t** ipiv_array; scalar_t** self_array; scalar_t** self_inv_array; - ALLOCATE_ARRAY(info_array, magma_int_t, batch_size); - ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n); - ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size); - ALLOCATE_ARRAY(self_array, scalar_t*, batch_size); - ALLOCATE_ARRAY(self_inv_array, scalar_t*, batch_size); + magma_int_t batch_size_or_one = std::max(1, batch_size); + + ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size_or_one * lda); + ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size_or_one); + ALLOCATE_ARRAY(self_array, scalar_t*, batch_size_or_one); + ALLOCATE_ARRAY(self_inv_array, scalar_t*, batch_size_or_one); // Set up the created arrays for (int64_t i = 0; i < batch_size; i++) { @@ -1262,7 +1272,7 @@ AT_ERROR("inverse: MAGMA library not found in " MAGMAQueue magma_queue(self.get_device()); magmaLuBatched( - n, n, self_array, n, ipiv_array, info_array, + n, n, self_array, lda, ipiv_array, infos_lu_data, batch_size, magma_queue); constexpr int64_t batch_limit = 65535; @@ -1274,67 +1284,67 @@ AT_ERROR("inverse: MAGMA library not found in " scalar_t** self_array_cur = &self_array[mini_idx]; scalar_t** self_inv_array_cur = &self_inv_array[mini_idx]; magma_int_t** ipiv_array_cur = &ipiv_array[mini_idx]; - magma_int_t* info_array_cur = &info_array[mini_idx]; + magma_int_t* info_array_cur_getri = &infos_getri_data[mini_idx]; magmaGetriBatched( - n, self_array_cur, n, ipiv_array_cur, self_inv_array_cur, - n, info_array_cur, batch_limit, magma_queue); + n, self_array_cur, lda, ipiv_array_cur, self_inv_array_cur, + lda, info_array_cur_getri, batch_limit, magma_queue); } // Compute whatever is left = batch_size - floor(batch_size / batch_limit) * batch_limit // which concisely is equal to batch_size % batch_limit if (batch_size % batch_limit != 0) { magmaGetriBatched( - n, &self_array[mini_idx], n, &ipiv_array[mini_idx], &self_inv_array[mini_idx], - n, &info_array[mini_idx], batch_size % batch_limit, magma_queue); - } - - for (int64_t i = 0; i < batch_size; i++) { - infos[i] = info_array[i]; + n, &self_array[mini_idx], lda, &ipiv_array[mini_idx], &self_inv_array[mini_idx], + lda, &infos_getri_data[mini_idx], batch_size % batch_limit, magma_queue); } #endif } template -static 
void apply_single_inverse(Tensor& self, int64_t& info) { +static void apply_single_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. Please rebuild with MAGMA."); #else auto self_data = self.data_ptr(); magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); + magma_int_t lda = std::max(1, n); magma_int_t lwork = n * magmaGetriOptimalBlocksize(n); - magma_int_t info_tmp = 0; - Tensor ipiv = at::empty({n}, at::kInt); + // magmaLu and magmaGetri requires infos tensor to live on CPU + infos_lu = infos_lu.to(at::kCPU); + infos_getri = infos_getri.to(at::kCPU); + + Tensor ipiv = at::empty({lda}, at::kInt); Tensor dwork = at::empty({lwork}, self.options()); - magmaLu(n, n, self_data, n, ipiv.data_ptr(), &info_tmp); - if (info_tmp != 0) { - info = info_tmp; - return; - } + magmaLu(n, n, self_data, lda, ipiv.data_ptr(), infos_lu.data_ptr()); magmaGetri( - n, self_data, n, ipiv.data_ptr(), dwork.data_ptr(), lwork, &info_tmp); - info = info_tmp; + n, self_data, lda, ipiv.data_ptr(), dwork.data_ptr(), lwork, infos_getri.data_ptr()); #endif } Tensor _inverse_helper_cuda_legacy(const Tensor& self) { auto self_inv_working_copy = cloneBatchedColumnMajor(self); if (self.dim() > 2) { - std::vector infos(batchCount(self), 0); + auto infos_lu = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + auto infos_getri = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); auto self_working_copy = cloneBatchedColumnMajor(self); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ apply_batched_inverse( - self_working_copy, self_inv_working_copy, infos); + self_working_copy, self_inv_working_copy, infos_lu, infos_getri); }); - batchCheckErrors(infos, "inverse_cuda"); + batchCheckErrors(infos_lu, "inverse_cuda"); + batchCheckErrors(infos_getri, "inverse_cuda"); } else { - int64_t info = 0; + // magmaLu and magmaGetri requires infos tensor to live on CPU + auto infos_lu = at::zeros({1}, self.options().dtype(kInt).device(kCPU)); + auto infos_getri = at::zeros({1}, self.options().dtype(kInt).device(kCPU)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_single_inverse(self_inv_working_copy, info); + apply_single_inverse(self_inv_working_copy, infos_lu, infos_getri); }); - singleCheckErrors(info, "inverse_cuda"); + singleCheckErrors(infos_lu.item().toInt(), "inverse_cuda"); + singleCheckErrors(infos_getri.item().toInt(), "inverse_cuda"); } return self_inv_working_copy; } @@ -1351,6 +1361,39 @@ Tensor _inverse_helper_cuda(const Tensor& self) { #endif } +// This is a type dispatching helper function for 'apply_batched_inverse' and 'singleCheckErrors' +Tensor& _linalg_inv_out_helper_cuda_legacy(Tensor& result, Tensor& infos_lu, Tensor& infos_getri) { + // assuming result is in column major order and contains the matrices to invert + if (result.dim() > 2) { + auto input_working_copy = cloneBatchedColumnMajor(result); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ + apply_batched_inverse( + input_working_copy, result, infos_lu, infos_getri); + }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ + apply_single_inverse(result, infos_lu, infos_getri); + }); + } + return result; +} + +// This is a MAGMA/cuSOLVER dispatching helper function +Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& 
infos_getri) { + // This function calculates the inverse matrix in-place + // result should be in column major order and contain matrices to invert +#ifdef USE_CUSOLVER + if ((result.dim() == 2) || (/* result.dim() > 2 && */ batchCount(result) <= 2) || !use_magma_) { + return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri); // cusolver or cublas + } else { + return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda + } +#else + return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda +#endif + return result; +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu index 37c360357e82..534f257d55bb 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu @@ -26,28 +26,31 @@ inline static Tensor column_major_identity_matrix_like(const Tensor& self) { } template -inline static void _apply_single_inverse_helper(scalar_t* self_ptr, scalar_t* self_inv_ptr, int* ipiv_ptr, int* info_ptr, int n) { +inline static void _apply_single_inverse_helper(scalar_t* self_ptr, scalar_t* self_inv_ptr, int* ipiv_ptr, int* info_getrf_ptr, int* info_getrs_ptr, int n, int lda) { // self_inv_ptr should already be an identity matrix auto handle = at::cuda::getCurrentCUDASolverDnHandle(); - at::cuda::solver::getrf(handle, n, n, self_ptr, n, ipiv_ptr, info_ptr); - at::cuda::solver::getrs(handle, n, n, self_ptr, n, ipiv_ptr, self_inv_ptr, n, info_ptr + 1); + at::cuda::solver::getrf(handle, n, n, self_ptr, lda, ipiv_ptr, info_getrf_ptr); + at::cuda::solver::getrs(handle, n, n, self_ptr, lda, ipiv_ptr, self_inv_ptr, lda, info_getrs_ptr); } template -static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& infos) { +static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& infos_getrf, Tensor& infos_getrs) { const int batch_size = cuda_int_cast(batchCount(self), "batchCount"); const int n = cuda_int_cast(self.size(-2), "self.size(-2)"); + const int lda = std::max(1, n); auto self_data = self.data_ptr(); auto self_mat_stride = matrixStride(self); auto self_inv_data = self_inv.data_ptr(); auto self_inv_mat_stride = matrixStride(self_inv); + auto infos_getrf_data = infos_getrf.data_ptr(); + auto infos_getrs_data = infos_getrs.data_ptr(); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); if (use_loop_launch(batch_size, n)) { - int* p_infos = infos.data_ptr(); auto main_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAEvent main_event; @@ -59,10 +62,14 @@ static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& in main_event.block(stream); - auto dataPtr = allocator.allocate(sizeof(int) * n); + auto dataPtr = allocator.allocate(sizeof(int) * lda); int* pivot = reinterpret_cast(dataPtr.get()); + + int* infos_getrf_working_ptr = &infos_getrf_data[i]; + int* infos_getrs_working_ptr = &infos_getrs_data[i]; + _apply_single_inverse_helper( - &self_data[i * self_mat_stride], &self_inv_data[i * self_inv_mat_stride], pivot, p_infos + i * 2, n); + &self_data[i * self_mat_stride], &self_inv_data[i * self_inv_mat_stride], pivot, infos_getrf_working_ptr, infos_getrs_working_ptr, n, lda); at::cuda::CUDAEvent finished; finished.record(stream); @@ -79,30 +86,52 @@ static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& in 
reinterpret_cast(&self_inv_data[(batch_size-1) * self_inv_mat_stride]) + 1, static_cast(self_inv_mat_stride * sizeof(scalar_t)), self.options().dtype(at::kLong)); - auto dataPtr = allocator.allocate(sizeof(int)*batch_size*n); + auto dataPtr = allocator.allocate(sizeof(int)*batch_size*lda); int* ipiv_array = reinterpret_cast(dataPtr.get()); - Tensor _info1 = at::zeros({batch_size}, self.options().dtype(at::kInt)); - Tensor _info2 = at::zeros({batch_size}, self.options().dtype(at::kInt)); - - at::cuda::blas::getrfBatched(n, reinterpret_cast(self_array.data_ptr()), n, - ipiv_array, _info1.data_ptr(), batch_size); + at::cuda::blas::getrfBatched(n, reinterpret_cast(self_array.data_ptr()), lda, + ipiv_array, infos_getrf_data, batch_size); - at::cuda::blas::getriBatched(n, reinterpret_cast(self_array.data_ptr()), n, - ipiv_array, _info2.data_ptr(), batch_size, reinterpret_cast(self_inv_array.data_ptr())); - - infos = at::stack({_info1, _info2}, 1); + at::cuda::blas::getriBatched(n, reinterpret_cast(self_array.data_ptr()), lda, + ipiv_array, reinterpret_cast(self_inv_array.data_ptr()), lda, infos_getrs_data, batch_size); } } template -static void apply_single_inverse_lib(const Tensor& self, Tensor& self_inv, Tensor& info) { +static void apply_single_inverse_lib(const Tensor& self, Tensor& self_inv, Tensor& infos_getrf, Tensor& infos_getrs) { int n = cuda_int_cast(self.size(-2), "self.size(-2)"); + int lda = std::max(1, n); - Tensor ipiv = at::empty({n}, self.options().dtype(at::kInt)); + Tensor ipiv = at::empty({lda}, self.options().dtype(at::kInt)); _apply_single_inverse_helper( - self.data_ptr(), self_inv.data_ptr(), ipiv.data_ptr(), info.data_ptr(), n); + self.data_ptr(), self_inv.data_ptr(), ipiv.data_ptr(), infos_getrf.data_ptr(), infos_getrs.data_ptr(), n, lda); +} + +// This is a type dispatching helper function for 'apply_batched_inverse_lib' and 'apply_single_inverse_lib' +Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Tensor& infos_getrs) { + // assuming result is in column major order and contains the matrices to invert + Tensor input_working_copy = cloneBatchedColumnMajor(result); + + // for getrf + getrs (cusolver path) + // result should be filled with identity matrices + result.zero_(); + result.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); + + const int batch_size = cuda_int_cast(batchCount(result), "batchCount"); + + if (result.dim() > 2) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ + apply_batched_inverse_lib( + input_working_copy, result, infos_getrf, infos_getrs); + }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ + apply_single_inverse_lib(input_working_copy, result, infos_getrf, infos_getrs); + }); + } + + return result; } Tensor _inverse_helper_cuda_lib(const Tensor& self) { @@ -111,18 +140,22 @@ Tensor _inverse_helper_cuda_lib(const Tensor& self) { const int batch_size = cuda_int_cast(batchCount(self), "batchCount"); if (self.dim() > 2 && batch_size > 1) { - Tensor infos = at::zeros({batchCount(self) * 2}, self.options().dtype(kInt)); + Tensor infos_getrf = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + Tensor infos_getrs = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ apply_batched_inverse_lib( - self_working_copy, self_inv_working_copy, infos); + self_working_copy, self_inv_working_copy, 
infos_getrf, infos_getrs); }); - batchCheckErrors(infos, "inverse_cuda", false, 2); + batchCheckErrors(infos_getrf, "inverse_cuda"); + batchCheckErrors(infos_getrs, "inverse_cuda"); } else { - Tensor info = at::zeros({2}, self.options().dtype(at::kInt)); + Tensor infos_getrf = at::zeros({1}, self.options().dtype(kInt)); + Tensor infos_getrs = at::zeros({1}, self.options().dtype(kInt)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_single_inverse_lib(self_working_copy, self_inv_working_copy, info); + apply_single_inverse_lib(self_working_copy, self_inv_working_copy, infos_getrf, infos_getrs); }); - batchCheckErrors(info, "inverse_cuda", false, 2); + batchCheckErrors(infos_getrf, "inverse_cuda"); + batchCheckErrors(infos_getrs, "inverse_cuda"); } return self_inv_working_copy; diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h index dc6dc2f9daca..2be18137a64f 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h @@ -18,6 +18,7 @@ namespace at { namespace native { Tensor _inverse_helper_cuda_lib(const Tensor& self); +Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Tensor& infos_getrs); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index 88e4d2f9a8e3..69a366cc9cd5 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -47,7 +47,9 @@ Tensor prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_ten ld_tensor = tensor_strides[fast_dim]; } else { transpose_tensor = !transpose_result; - if (tensor.is_contiguous()) { + // gemm call requires leading dimension and stride parameters to be non-zero + bool is_stride_non_zero = tensor.stride(1) != 0 && tensor.stride(2) != 0; + if (tensor.is_contiguous() && is_stride_non_zero) { tensor_ = tensor; } else { tensor_ = tensor.clone(at::MemoryFormat::Contiguous); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 48692e792ae3..5efbc2f23080 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -10047,6 +10047,27 @@ dispatch: DefaultBackend: linalg_eigvalsh_out +- func: _linalg_inv_out_helper_(Tensor(a!) self, Tensor(b!) infos_lu, Tensor(c!) infos_getri) -> Tensor(a!) + use_c10_dispatcher: full + variants: function + dispatch: + CPU: _linalg_inv_out_helper_cpu + CUDA: _linalg_inv_out_helper_cuda + +- func: linalg_inv(Tensor self) -> Tensor + python_module: linalg + use_c10_dispatcher: full + variants: function + dispatch: + DefaultBackend: linalg_inv + +- func: linalg_inv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + DefaultBackend: linalg_inv_out + - func: inner(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index 36f91627d522..761d0e97a1e9 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -22,3 +22,4 @@ Functions .. autofunction:: solve .. autofunction:: tensorinv .. autofunction:: tensorsolve +.. 
autofunction:: inv diff --git a/test/test_linalg.py b/test/test_linalg.py index 8402be471a88..742bfc9e35be 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1993,18 +1993,19 @@ def func(root, b, upper): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3}) + @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) def test_inverse(self, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value - def run_test(matrix, batches, n): - matrix_inverse = torch.inverse(matrix) + def run_test(torch_inverse, matrix, batches, n): + matrix_inverse = torch_inverse(matrix) # Compare against NumPy output # NumPy uses 'gesv' LAPACK routine solving the equation A A_inv = I # But in PyTorch 'gertf' + 'getri' is used causing element-wise differences expected = np.linalg.inv(matrix.cpu().numpy()) - self.assertEqual(matrix_inverse, expected, atol=self.precision, rtol=1e-4) + self.assertEqual(matrix_inverse, expected, atol=self.precision, rtol=self.precision) # Additional correctness tests, check matrix*matrix_inverse == identity identity = torch.eye(n, dtype=dtype, device=device) @@ -2012,44 +2013,49 @@ def run_test(matrix, batches, n): self.assertEqual(identity.expand_as(matrix), torch.matmul(matrix_inverse, matrix)) # check the out= variant + # prepare the expected out tensor matrix_inverse_out = torch.empty(*batches, n, n, dtype=dtype, device=device) - ans = torch.inverse(matrix, out=matrix_inverse_out) + matrix_inverse_out_t = matrix_inverse_out.transpose(-2, -1).clone(memory_format=torch.contiguous_format) + matrix_inverse_out = matrix_inverse_out_t.transpose(-2, -1) + ans = torch_inverse(matrix, out=matrix_inverse_out) self.assertEqual(matrix_inverse_out, ans, atol=0, rtol=0) self.assertEqual(matrix_inverse_out, matrix_inverse, atol=0, rtol=0) # batched matrices: 3+ dimensional tensors, check matrix_inverse same as single-inverse for each matrix - if matrix.ndim > 2: + if matrix.ndim > 2 and batches[0] != 0: expected_inv_list = [] p = int(np.prod(batches)) # use `p` instead of -1, so that the test works for empty input as well for mat in matrix.contiguous().view(p, n, n): - expected_inv_list.append(torch.inverse(mat)) + expected_inv_list.append(torch_inverse(mat)) expected_inv = torch.stack(expected_inv_list).view(*batches, n, n) if self.device_type == 'cuda' and dtype in [torch.float32, torch.complex64]: # single-inverse is done using cuSOLVER, while batched inverse is done using MAGMA # individual values can be significantly different for fp32, hence rather high rtol is used - # the important thing is that torch.inverse passes above checks with identity + # the important thing is that torch_inverse passes above checks with identity self.assertEqual(matrix_inverse, expected_inv, atol=1e-1, rtol=1e-2) else: self.assertEqual(matrix_inverse, expected_inv) - for batches, n in itertools.product( - [[], [1], [4], [2, 3]], - [0, 5, 64] - ): - # large batch size and large matrix size will be tested in test_inverse_many_batches (slow test) - if batches and batches[0] == 32 and n == 256: - continue - matrices = random_fullrank_matrix_distinct_singular_value(n, *batches, dtype=dtype).to(device) - run_test(matrices, batches, n) - - # test non-contiguous input - run_test(matrices.transpose(-2, -1), batches, n) - if n > 0: - run_test( - 
random_fullrank_matrix_distinct_singular_value(n * 2, *batches, dtype=dtype).to(device) - .view(-1, n * 2, n * 2)[:, ::2, ::2].view(*batches, n, n), - batches, n - ) + for torch_inverse in [torch.inverse, torch.linalg.inv]: + for batches, n in itertools.product( + [[], [0], [1], [4], [2, 3]], + [0, 5, 64] + ): + # large batch size and large matrix size will be tested in test_inverse_many_batches (slow test) + if batches and batches[0] == 32 and n == 256: + continue + matrices = random_fullrank_matrix_distinct_singular_value(n, *batches, dtype=dtype).to(device) + run_test(torch_inverse, matrices, batches, n) + + # test non-contiguous input + run_test(torch_inverse, matrices.transpose(-2, -1), batches, n) + if n > 0: + run_test( + torch_inverse, + random_fullrank_matrix_distinct_singular_value(n * 2, *batches, dtype=dtype).to(device) + .view(-1, n * 2, n * 2)[:, ::2, ::2].view(*batches, n, n), + batches, n + ) @slowTest @skipCUDAIfNoMagmaAndNoCusolver @@ -2060,17 +2066,18 @@ def run_test(matrix, batches, n): def test_inverse_many_batches(self, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value - def test_inverse_many_batches_helper(b, n): + def test_inverse_many_batches_helper(torch_inverse, b, n): matrices = random_fullrank_matrix_distinct_singular_value(b, n, n, dtype=dtype).to(device) - matrices_inverse = torch.inverse(matrices) + matrices_inverse = torch_inverse(matrices) # Compare against NumPy output expected = np.linalg.inv(matrices.cpu().numpy()) - self.assertEqual(matrices_inverse, expected, atol=self.precision, rtol=1e-4) + self.assertEqual(matrices_inverse, expected, atol=self.precision, rtol=1e-3) - test_inverse_many_batches_helper(5, 256) - test_inverse_many_batches_helper(3, 512) - test_inverse_many_batches_helper(64, 64) + for torch_inverse in [torch.inverse, torch.linalg.inv]: + test_inverse_many_batches_helper(torch_inverse, 5, 256) + test_inverse_many_batches_helper(torch_inverse, 3, 512) + test_inverse_many_batches_helper(torch_inverse, 64, 64) @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @@ -2091,6 +2098,48 @@ def run_test_singular_input(batch_dim, n): for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: run_test_singular_input(*params) + @skipCUDAIfNoMagmaAndNoCusolver + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_inv_errors(self, device, dtype): + # inv expects batches of square matrices as input + a = torch.randn(2, 3, 4, 3, dtype=dtype, device=device) + with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"): + torch.linalg.inv(a) + + # inv requires the input to be at least 2 dimensional tensor + a = torch.randn(2, device=device, dtype=dtype) + with self.assertRaisesRegex(RuntimeError, "must have at least 2 dimensions"): + torch.linalg.inv(a) + + # if input is not invertible, RuntimeError is raised mentioning the first non-invertible batch + def run_test_singular_input(batch_dim, n): + a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1) + a[n, -1, -1] = 0 + with self.assertRaisesRegex(RuntimeError, rf"For batch {n}: U\(3,3\) is zero"): + torch.linalg.inv(a) + + for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: + run_test_singular_input(*params) + + # if non-empty out tensor with wrong shape is passed an error is thrown + a = torch.randn(2, 3, 3, device=device, dtype=dtype) + out = torch.empty(1, device=device, dtype=dtype) + with 
self.assertRaisesRegex(RuntimeError, "does not match input shape"): + torch.linalg.inv(a, out=out) + + # dtypes should match + out = torch.empty_like(a).to(torch.int) + with self.assertRaisesRegex(RuntimeError, "result dtype Int does not match input dtype"): + torch.linalg.inv(a, out=out) + + # device should match + if torch.cuda.is_available(): + wrong_device = 'cpu' if self.device_type != 'cpu' else 'cuda' + out = torch.empty(0, device=wrong_device, dtype=dtype) + with self.assertRaisesRegex(RuntimeError, "does not match input device"): + torch.linalg.inv(a, out=out) + def solve_test_helper(self, A_dims, b_dims, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9f68622e7691..802eb3fed71a 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -596,6 +596,9 @@ - name: inverse(Tensor self) -> Tensor self: -at::matmul(result.conj().transpose(-2, -1), at::matmul(grad, result.conj().transpose(-2, -1))) +- name: linalg_inv(Tensor self) -> Tensor + self: -at::matmul(result.conj().transpose(-2, -1), at::matmul(grad, result.conj().transpose(-2, -1))) + - name: isnan(Tensor self) -> Tensor self: non_differentiable diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 03fbf34034ea..1f1620a1c418 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd', + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd', 'linalg_inv', '_fft_c2c', '_fft_r2c', 'linalg_solve', 'sqrt' } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index f01755c45a74..22e3318d331a 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -84,6 +84,14 @@ inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& return torch::linalg_tensorsolve_out(result, self, other, dims); } +inline Tensor inv(const Tensor& input) { + return torch::linalg_inv(input); +} + +inline Tensor& inv_out(Tensor& result, const Tensor& input) { + return torch::linalg_inv_out(result, input); +} + } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -205,4 +213,15 @@ inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor return detail::tensorsolve_out(result, input, other, dims); } +/// Computes a tensor `inverse_input` such that `dot(input, inverse_input) = eye(input.size(0))`. 
+/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.inv +inline Tensor inv(const Tensor& input) { + return detail::inv(input); +} + +inline Tensor& inv_out(Tensor& result, const Tensor& input) { + return detail::inv_out(result, input); +} + }} // torch::linalg diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index 575bc92534be..f80aa8da045d 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -82,6 +82,65 @@ True """) +inv = _add_docstr(_linalg.linalg_inv, r""" +linalg.inv(input, *, out=None) -> Tensor + +This function computes the "multiplicative inverse" matrix of a square matrix, or batch of such matrices, :attr:`input`. +The result satisfies the relation + +``matmul(inv(input), input) = matmul(input, inv(input)) = eye(input.shape[0]).expand_as(input)``. + +Supports input of float, double, cfloat and cdouble data types. + +.. note:: If :attr:`input` is a non-invertible matrix or non-square matrix, or batch with at least one such matrix, + then a RuntimeError will be thrown. + +.. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. + +Args: + input (Tensor): the square :math:`n \times n` matrix or the batch + of such matrices of size :math:`(*, n, n)` where `*` is one or more batch dimensions. + +Keyword args: + out (Tensor, optional): The output tensor. Ignored if None. Default: None + +Examples:: + + >>> x = torch.rand(4, 4) + >>> y = torch.linalg.inv(x) + >>> z = torch.mm(x, y) + >>> z + tensor([[ 1.0000, -0.0000, -0.0000, 0.0000], + [ 0.0000, 1.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 1.0000, 0.0000], + [ 0.0000, -0.0000, -0.0000, 1.0000]]) + >>> torch.max(torch.abs(z - torch.eye(4))) # Max non-zero + tensor(1.1921e-07) + + >>> # Batched inverse example + >>> x = torch.randn(2, 3, 4, 4) + >>> y = torch.linalg.inv(x) + >>> z = torch.matmul(x, y) + >>> torch.max(torch.abs(z - torch.eye(4).expand_as(x))) # Max non-zero + tensor(1.9073e-06) + + >>> x = torch.rand(4, 4, dtype=torch.cdouble) + >>> y = torch.linalg.inv(x) + >>> z = torch.mm(x, y) + >>> z + tensor([[ 1.0000e+00+0.0000e+00j, -1.3878e-16+3.4694e-16j, + 5.5511e-17-1.1102e-16j, 0.0000e+00-1.6653e-16j], + [ 5.5511e-16-1.6653e-16j, 1.0000e+00+6.9389e-17j, + 2.2204e-16-1.1102e-16j, -2.2204e-16+1.1102e-16j], + [ 3.8858e-16-1.2490e-16j, 2.7756e-17+3.4694e-17j, + 1.0000e+00+0.0000e+00j, -4.4409e-16+5.5511e-17j], + [ 4.4409e-16+5.5511e-16j, -3.8858e-16+1.8041e-16j, + 2.2204e-16+0.0000e+00j, 1.0000e+00-3.4694e-16j]], + dtype=torch.complex128) + >>> torch.max(torch.abs(z - torch.eye(4, dtype=torch.cdouble))) # Max non-zero + tensor(7.5107e-16, dtype=torch.float64) +""") + det = _add_docstr(_linalg.linalg_det, r""" linalg.det(input) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index 6c193b273344..e2396c806ccf 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -465,6 +465,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: cudnn_enabled: -1), torch.int_repr: lambda input: -1, torch.inverse: lambda input, out=None: -1, + torch.linalg.inv: lambda input, out=None: -1, torch.is_complex: lambda input: -1, torch.is_distributed: lambda input: -1, torch.is_floating_point: lambda input: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 72a167aa7ca9..80cd0d41f707 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -309,6 +309,31 @@ def 
sample_inputs_xlogy(self, device, dtype, requires_grad): low=0, high=None, requires_grad=requires_grad))),) +def sample_inputs_linalg_inv(op_info, device, dtype, requires_grad=False): + """ + This function generates always invertible input for torch.linalg.inv using + random_fullrank_matrix_distinct_singular_value. + The input is generated as the itertools.product of 'batches' and 'ns'. + In total this function generates 8 SampleInputs + 'batches' cases include: + () - single input, + (0,) - zero batched dimension, + (2,) - batch of two matrices, + (2, 3) - 2x3 batch of matrices + 'ns' gives 0x0 and 5x5 matrices. + Zeros in dimensions are edge cases in the implementation and important to test for in order to avoid unexpected crashes. + """ + from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value + + batches = [(), (0, ), (2, ), (2, 3)] + ns = [0, 5] + out = [] + for batch, n in product(batches, ns): + a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype).to(device) + a.requires_grad = requires_grad + out.append(SampleInput(a)) + return out + def np_sinc_with_fp16_as_fp32(x): # Wraps numpy's sinc function so that fp16 values are promoted to fp32 # before sinc is invoked. Context: numpy's sinc returns NaN when evaluated @@ -1013,6 +1038,14 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): dtypes=[torch.bfloat16])), promotes_integers_to_float=True, handles_complex_extremals=False), + OpInfo('linalg.inv', + aten_name='linalg_inv', + op=torch.linalg.inv, + dtypes=floating_and_complex_types(), + test_inplace_grad=False, + supports_tensor_out=True, + sample_inputs_func=sample_inputs_linalg_inv, + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), UnaryUfuncInfo('angle', ref=np.angle, dtypes=all_types_and_complex_and(torch.bool), From b3387139b42b48b51b8d66f9c511ecbfa00c0fe4 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 23 Dec 2020 11:29:45 -0800 Subject: [PATCH 32/45] Mod lists to neutral+descriptive terms in caffe2/docs (#49803) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49803 Per "https://fb.workplace.com/groups/e/permalink/3320810064641820/" we can no longer use the terms "whitelist" and "blacklist", and editing any file containing them results in a critical error signal. Let's embrace the change. This diff changes "blacklist" to "blocklist" in a number of non-interface contexts (interfaces would require more extensive testing and might interfere with reading stored data, so those are deferred until later). Test Plan: Sandcastle Reviewed By: vkuzo Differential Revision: D25686924 fbshipit-source-id: 117de2ca43a0ea21b6e465cf5082e605e42adbf6 --- docs/source/mobile_optimizer.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/mobile_optimizer.rst b/docs/source/mobile_optimizer.rst index 3067a2db4379..bb11abf82dba 100644 --- a/docs/source/mobile_optimizer.rst +++ b/docs/source/mobile_optimizer.rst @@ -5,14 +5,14 @@ torch.utils.mobile_optimizer This API is in beta and may change in the near future. Torch mobile supports ``torch.mobile_optimizer.optimize_for_mobile`` utility to run a list of optimization pass with modules in eval mode. 
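For orientation, a minimal usage sketch of the utility (illustrative only: ``MyModel`` is a hypothetical module,
the import path is assumed to be ``torch.utils.mobile_optimizer``, and the output file name is arbitrary):

```python
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile

# Hypothetical toy module used only for illustration.
class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return torch.relu(self.bn(self.conv(x)))

scripted = torch.jit.script(MyModel().eval())  # the passes below expect a module in eval mode
optimized = optimize_for_mobile(scripted)      # runs the default optimization passes described below
torch.jit.save(optimized, "mobile_model.pt")   # arbitrary file name
```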
-The method takes the following parameters: a torch.jit.ScriptModule object, a blacklisting optimization set and a preserved method list +The method takes the following parameters: a torch.jit.ScriptModule object, a blocklisting optimization set and a preserved method list -By default, if optimization blacklist is None or empty, ``optimize_for_mobile`` will run the following optimizations: - - **Conv2D + BatchNorm fusion** (blacklisting option `MobileOptimizerType::CONV_BN_FUSION`): This optimization pass folds ``Conv2d-BatchNorm2d`` into ``Conv2d`` in ``forward`` method of this module and all its submodules. The weight and bias of the ``Conv2d`` are correspondingly updated. - - **Insert and Fold prepacked ops** (blacklisting option `MobileOptimizerType::INSERT_FOLD_PREPACK_OPS`): This optimization pass rewrites the graph to replace 2D convolutions and linear ops with their prepacked counterparts. Prepacked ops are stateful ops in that, they require some state to be created, such as weight prepacking and use this state, i.e. prepacked weights, during op execution. XNNPACK is one such backend that provides prepacked ops, with kernels optimized for mobile platforms (such as ARM CPUs). Prepacking of weight enables efficient memory access and thus faster kernel execution. At the moment ``optimize_for_mobile`` pass rewrites the graph to replace ``Conv2D/Linear`` with 1) op that pre-packs weight for XNNPACK conv2d/linear ops and 2) op that takes pre-packed weight and activation as input and generates output activations. Since 1 needs to be done only once, we fold the weight pre-packing such that it is done only once at model load time. This pass of the ``optimize_for_mobile`` does 1 and 2 and then folds, i.e. removes, weight pre-packing ops. +By default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimizations: + - **Conv2D + BatchNorm fusion** (blocklisting option `MobileOptimizerType::CONV_BN_FUSION`): This optimization pass folds ``Conv2d-BatchNorm2d`` into ``Conv2d`` in ``forward`` method of this module and all its submodules. The weight and bias of the ``Conv2d`` are correspondingly updated. + - **Insert and Fold prepacked ops** (blocklisting option `MobileOptimizerType::INSERT_FOLD_PREPACK_OPS`): This optimization pass rewrites the graph to replace 2D convolutions and linear ops with their prepacked counterparts. Prepacked ops are stateful ops in that, they require some state to be created, such as weight prepacking and use this state, i.e. prepacked weights, during op execution. XNNPACK is one such backend that provides prepacked ops, with kernels optimized for mobile platforms (such as ARM CPUs). Prepacking of weight enables efficient memory access and thus faster kernel execution. At the moment ``optimize_for_mobile`` pass rewrites the graph to replace ``Conv2D/Linear`` with 1) op that pre-packs weight for XNNPACK conv2d/linear ops and 2) op that takes pre-packed weight and activation as input and generates output activations. Since 1 needs to be done only once, we fold the weight pre-packing such that it is done only once at model load time. This pass of the ``optimize_for_mobile`` does 1 and 2 and then folds, i.e. removes, weight pre-packing ops. - **ReLU/Hardtanh fusion**: XNNPACK ops support fusion of clamping. That is clamping of output activation is done as part of the kernel, including for 2D convolution and linear op kernels. Thus clamping effectively comes for free. 
Thus any op that can be expressed as clamping op, such as ``ReLU`` or ``hardtanh``, can be fused with previous ``Conv2D`` or ``linear`` op in XNNPACK. This pass rewrites graph by finding ``ReLU/hardtanh`` ops that follow XNNPACK ``Conv2D/linear`` ops, written by the previous pass, and fuses them together. - - **Dropout removal** (blacklisting option `MobileOptimizerType::REMOVE_DROPOUT`): This optimization pass removes ``dropout`` and ``dropout_`` nodes from this module when training is false. - - **Conv packed params hoisting** (blacklisting option `MobileOptimizerType::HOIST_CONV_PACKED_PARAMS`): This optimization pass moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics. + - **Dropout removal** (blocklisting option `MobileOptimizerType::REMOVE_DROPOUT`): This optimization pass removes ``dropout`` and ``dropout_`` nodes from this module when training is false. + - **Conv packed params hoisting** (blocklisting option `MobileOptimizerType::HOIST_CONV_PACKED_PARAMS`): This optimization pass moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics. ``optimize_for_mobile`` will also invoke freeze_module pass which only preserves ``forward`` method. If you have other method to that needed to be preserved, add them into the preserved method list and pass into the method. From d99a0c3b3e69621505692d0c69a9265a13f5761c Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 23 Dec 2020 12:20:52 -0800 Subject: [PATCH 33/45] Improve docs for scatter and gather functions (#49679) Summary: - Add warning about non-unique indices - And note that these functions don't broadcast - Add missing `torch.scatter` and `torch.scatter_add` doc entries - Fix parameter descriptions - Improve code examples to make indexing behaviour easier to understand Closes gh-48214 Closes gh-26191 Closes gh-37130 Closes gh-34062 xref gh-31776 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49679 Reviewed By: mruberry Differential Revision: D25693660 Pulled By: ngimel fbshipit-source-id: 4983e7b4efcbdf1ab9f04e58973b4f983e8e43a4 --- docs/source/torch.rst | 2 + torch/_tensor_docs.py | 109 +++++++++++++++++++++++++----------------- torch/_torch_docs.py | 28 ++++++++--- 3 files changed, 89 insertions(+), 50 deletions(-) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 3057339aa811..46960ecdb1b4 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -97,6 +97,8 @@ Indexing, Slicing, Joining, Mutating Ops nonzero reshape row_stack + scatter + scatter_add split squeeze stack diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index e9443202785d..588c59ef98a6 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3127,14 +3127,25 @@ def callable(a, b) -> number This is the reverse operation of the manner described in :meth:`~Tensor.gather`. -:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should have same -number of dimensions. It is also required that ``index.size(d) <= src.size(d)`` -for all dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all -dimensions ``d != dim``. +:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have +the same number of dimensions. It is also required that +``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that +``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. 
+Note that ``index`` and ``src`` do not broadcast. Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be -between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row -along the specified dimension :attr:`dim` must be unique. +between ``0`` and ``self.size(dim) - 1`` inclusive. + +.. warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + +.. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. Additionally accepts an optional :attr:`reduce` argument that allows specification of an optional reduction operation, which is applied to all @@ -3156,36 +3167,39 @@ def callable(a, b) -> number Args: dim (int): the axis along which to index - index (LongTensor): the indices of elements to scatter, - can be either empty or the same size of src. - When empty, the operation returns identity - src (Tensor): the source element(s) to scatter, - incase `value` is not specified - value (float): the source element(s) to scatter, - incase `src` is not specified - reduce (string): reduction operation to apply, - can be either 'add' or 'multiply'. + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor or float): the source element(s) to scatter. + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. Example:: - >>> x = torch.rand(2, 5) - >>> x - tensor([[ 0.3992, 0.2908, 0.9044, 0.4850, 0.6004], - [ 0.5735, 0.9006, 0.6797, 0.4152, 0.1732]]) - >>> torch.zeros(3, 5).scatter_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) - tensor([[ 0.3992, 0.9006, 0.6797, 0.4850, 0.6004], - [ 0.0000, 0.2908, 0.0000, 0.4152, 0.0000], - [ 0.5735, 0.0000, 0.9044, 0.0000, 0.1732]]) - - >>> z = torch.zeros(2, 4).scatter_(1, torch.tensor([[2], [3]]), 1.23) - >>> z - tensor([[ 0.0000, 0.0000, 1.2300, 0.0000], - [ 0.0000, 0.0000, 0.0000, 1.2300]]) + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) - >>> z = torch.ones(2, 4).scatter_(1, torch.tensor([[2], [3]]), 1.23, reduce='multiply') - >>> z - tensor([[1.0000, 1.0000, 1.2300, 1.0000], - [1.0000, 1.0000, 1.0000, 1.2300]]) """) add_docstr_all('scatter_add_', @@ -3208,28 +3222,35 @@ def callable(a, b) -> number :attr:`self`, :attr:`index` and :attr:`src` should have same number of dimensions. It is also required that ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions -``d != dim``. +``d != dim``. 
Note that ``index`` and ``src`` do not broadcast. Note: {forward_reproducibility_note} +.. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + Args: dim (int): the axis along which to index - index (LongTensor): the indices of elements to scatter and add, - can be either empty or the same size of src. - When empty, the operation returns identity. + index (LongTensor): the indices of elements to scatter and add, can be + either empty or of the same dimensionality as ``src``. When empty, the + operation returns ``self`` unchanged. src (Tensor): the source elements to scatter and add Example:: - >>> x = torch.rand(2, 5) - >>> x - tensor([[0.7404, 0.0427, 0.6480, 0.3806, 0.8328], - [0.7953, 0.2009, 0.9154, 0.6782, 0.9620]]) - >>> torch.ones(3, 5).scatter_add_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) - tensor([[1.7404, 1.2009, 1.9154, 1.3806, 1.8328], - [1.0000, 1.0427, 1.0000, 1.6782, 1.0000], - [1.7953, 1.0000, 1.6480, 1.0000, 1.9620]]) + >>> src = torch.ones((2, 5)) + >>> index = torch.tensor([[0, 1, 2, 0, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[1., 0., 0., 1., 1.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.]]) + >>> index = torch.tensor([[0, 1, 2, 0, 0], [0, 1, 2, 2, 2]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[2., 0., 0., 1., 1.], + [0., 2., 0., 0., 0.], + [0., 0., 2., 1., 1.]]) """.format(**reproducibility_notes)) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 073883b60407..d46a6b1bcf84 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3111,7 +3111,6 @@ def merge_dicts(*dicts): [5, 6, 7, 8]]) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.gather, r""" gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor @@ -3128,19 +3127,22 @@ def merge_dicts(*dicts): :math:`(x_0, x_1..., x_{i-1}, x_i, x_{i+1}, ..., x_{n-1})` and ``dim = i``, then :attr:`index` must be an :math:`n`-dimensional tensor with size :math:`(x_0, x_1, ..., x_{i-1}, y, x_{i+1}, ..., x_{n-1})` where :math:`y \geq 1` -and :attr:`out` will have the same size as :attr:`index`. -""" + r""" +and :attr:`out` will have the same size as :attr:`index`. Note that ``input`` +and ``index`` do not broadcast against each other. + Args: input (Tensor): the source tensor dim (int): the axis along which to index index (LongTensor): the indices of elements to gather - sparse_grad(bool,optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + +Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. 
out (Tensor, optional): the destination tensor Example:: - >>> t = torch.tensor([[1,2],[3,4]]) - >>> torch.gather(t, 1, torch.tensor([[0,0],[1,0]])) + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) tensor([[ 1, 1], [ 4, 3]]) """) @@ -7338,6 +7340,20 @@ def merge_dicts(*dicts): tensor([ nan, 1.8351, 0.8053, nan]) """.format(**common_args)) +add_docstr(torch.scatter, + r""" +scatter(input, dim, index, src) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_` +""") + +add_docstr(torch.scatter_add, + r""" +scatter_add(input, dim, index, src) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_add_` +""") + add_docstr(torch.set_flush_denormal, r""" set_flush_denormal(mode) -> bool From e1631729049498b1fa45bfa8bcbf66fb638beaa8 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Wed, 23 Dec 2020 12:36:41 -0800 Subject: [PATCH 34/45] removes more unused THC functions (#49788) Summary: per title Pull Request resolved: https://github.com/pytorch/pytorch/pull/49788 Reviewed By: mruberry Differential Revision: D25693328 Pulled By: ngimel fbshipit-source-id: 244a096214d110e4c1a94f2847ff8457f1afb0d1 --- aten/src/ATen/LegacyTHFunctionsCUDA.h | 1 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 78 ------------------- .../src/THC/generic/THCTensorMathPointwise.cu | 55 ------------- aten/src/THC/generic/THCTensorMathPointwise.h | 15 ---- aten/src/THCUNN/generic/RReLU.cu | 49 ------------ 5 files changed, 198 deletions(-) diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index df51c071c418..a10c20e41062 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -75,7 +75,6 @@ Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer); Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator); Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator); -Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training); Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator); std::tuple _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding); diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 878c8fb3d8a1..3da913580253 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -2498,84 +2498,6 @@ Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, } return output; } -Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { - const OptionalDeviceGuard 
device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 6, "_thnn_rrelu_with_noise_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - default: - AT_ERROR("_thnn_rrelu_with_noise_backward_out not supported on CUDAType for ", dispatch_scalar_type); - } - return grad_input; -} -Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training) { - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = 
checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - THNN_CudaDoubleRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - THNN_CudaRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 3, "_thnn_rrelu_with_noise_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto lower_ = lower.toDouble(); - auto upper_ = upper.toDouble(); - THNN_CudaHalfRReLU_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_, noise_, lower_, upper_, training, false); - break; - } - default: - AT_ERROR("_thnn_rrelu_with_noise_backward not supported on CUDAType for ", dispatch_scalar_type); - } - return grad_input; -} Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional generator) { const OptionalDeviceGuard device_guard(device_of(self)); auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index c33716c5f565..f7857cddc497 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -11,47 +11,6 @@ static void propagate_names_if_named_tensor_enabled(THCTensor* result, THCTensor at::namedinference::propagate_names(result, src); } -#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \ - struct Tensor_##NAME##_##REAL##_Op { \ - __device__ __forceinline__ void operator()(scalar_t* out, scalar_t* in) const { \ - *out = CFUNC(*in); \ - } \ - \ - __device__ __forceinline__ void operator()(scalar_t* v) const { \ - *v = CFUNC(*v); \ - } \ - }; \ - \ - void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \ - at::assert_no_internal_overlap(self_); \ - if (self_ == src) { \ - if (!THC_pointwiseApply1(state, self_, Tensor_##NAME##_##REAL##_Op())) { \ - THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ - } \ - } else { \ - 
THCTensor_(resizeAs)(state, self_, src); \ - \ - if (!THC_pointwiseApply2(state, self_, src, Tensor_##NAME##_##REAL##_Op())) { \ - THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ - } \ - } \ - \ - THCudaCheck(cudaGetLastError()); \ - propagate_names_if_named_tensor_enabled(self_, src); \ - } - -#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC, REAL) \ - IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) - -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - -IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( sqrt, THCNumerics::sqrt, Real) - -#endif -#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_ -#undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC - void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); @@ -69,19 +28,5 @@ void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *x, THC THCTensor_(free)(state, ny); THCTensor_(free)(state, nself); } - -namespace { -c10::intrusive_ptr retainTensorImpl(THCTensor* self) { - c10::raw::intrusive_ptr::incref(self); - return c10::intrusive_ptr::reclaim(self); -} -} - -void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ - auto out = at::Tensor(retainTensorImpl(self_)); - at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); -} - #endif #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index d22dae49bb0c..8a9ea1ad7885 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -4,22 +4,7 @@ #if !defined(THC_REAL_IS_BOOL) -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - -TORCH_CUDA_API void THCTensor_(atan)(THCState *state, THCTensor *self, THCTensor *src); -TORCH_CUDA_API void THCTensor_(sqrt)(THCState *state, THCTensor *self, THCTensor *src); - -#endif - -TORCH_CUDA_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, scalar_t min_value, scalar_t max_value); TORCH_CUDA_API void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension); -TORCH_CUDA_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, scalar_t value, THCTensor *src2); -TORCH_CUDA_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, scalar_t value, THCTensor *src2); -TORCH_CUDA_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -TORCH_CUDA_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -TORCH_CUDA_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -TORCH_CUDA_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); - #endif #endif diff --git a/aten/src/THCUNN/generic/RReLU.cu b/aten/src/THCUNN/generic/RReLU.cu index 9d664220e5a3..a32013861450 100644 --- a/aten/src/THCUNN/generic/RReLU.cu +++ b/aten/src/THCUNN/generic/RReLU.cu @@ -67,53 +67,4 @@ void THNN_(RReLU_updateOutput)( } } } - -void THNN_(RReLU_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCTensor *noise, - double lower, - double upper, - bool train, - bool inplace) -{ - THCUNN_check_nElement(state, input, gradOutput); - THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise); - - auto 
gradOutputTensor = THTensor_wrap(gradOutput).contiguous(); - gradOutput = gradOutputTensor.unsafeGetTensorImpl(); - - if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU - { - // multiply the gradient by the noise tensor - if (inplace) - { - THCTensor_(cmul)(state, gradOutput, gradOutput, noise); - THCTensor_(set)(state, gradInput, gradOutput); - } - else - { - THCTensor_(resizeAs)(state, gradInput, input); - THCTensor_(cmul)(state, gradInput, gradOutput, noise); - } - } - else - { - // use constant factor for negative input values - const scalar_t negSlope = ScalarConvert::to((lower + upper) / 2); - if (inplace) - { - THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope)); - THCTensor_(set)(state, gradInput, gradOutput); - } - else - { - THCTensor_(resizeAs)(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope)); - } - } -} - #endif From 4d6110939a7233efeab5a3e48f2eb42f28bee7fb Mon Sep 17 00:00:00 2001 From: Jianyu Huang Date: Wed, 23 Dec 2020 12:43:40 -0800 Subject: [PATCH 35/45] [pt][quant] Make the CUDA fake quantize logic consistent with CPU fake quantize logic (#49808) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49808 In PyTorch, it uses `dst = std::nearbyint(src * inv_scale) + zero_point` instead of the LEGACY `dst = std::nearbyint(src * inv_scale + zero_point)`. However, the CUDA implementation doesn't match this. This Diff makes the CPU and CUDA implementation consistent. - FBGEMM code pointer: https://github.com/pytorch/FBGEMM/blob/master/include/fbgemm/QuantUtils.h#L76-L80 - PyTorch code pointer: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/quantized/affine_quantizer.cpp#L306 Test Plan: CI Reviewed By: dskhudia Differential Revision: D25694235 fbshipit-source-id: 0a615e559132aafe18543deac1ea5028dd840cb9 --- .../native/quantized/cuda/affine_quantizer.cu | 18 +++++---- .../quantized/cuda/fake_quantize_core.cu | 39 +++++++++---------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu b/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu index 2c0c2a312e07..12f9058f6efd 100644 --- a/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu +++ b/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu @@ -25,14 +25,16 @@ void quantize_tensor_per_tensor_affine_cuda( .add_input(qtensor) .build(); - gpu_kernel(iter, - [=] GPU_LAMBDA (float raw_val, scalar_t quantized_val) -> scalar_t { - int64_t qvalue = static_cast(nearbyint(raw_val / scale + zero_point)); - qvalue = std::max(qvalue, qmin); - qvalue = std::min(qvalue, qmax); - quantized_val.val_ = qvalue; - return quantized_val; - }); + gpu_kernel( + iter, + [=] GPU_LAMBDA(float raw_val, scalar_t quantized_val) -> scalar_t { + int64_t qvalue = + static_cast(nearbyint(raw_val / scale) + zero_point); + qvalue = std::max(qvalue, qmin); + qvalue = std::min(qvalue, qmax); + quantized_val.val_ = qvalue; + return quantized_val; + }); }); } diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index 8e25f5ff443d..e2f51398b48f 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -34,17 +34,16 @@ void fake_quantize_tensor_kernel_cuda( .add_output(output) .add_input(input) .build(); - gpu_kernel(iter, - [=] GPU_LAMBDA 
(float input_val) -> float { - return (fminf( + gpu_kernel(iter, [=] GPU_LAMBDA(float input_val) -> float { + return (fminf( quant_max, fmaxf( quant_min, - static_cast(std::nearbyint( - input_val * inv_scale + zero_point)))) - + static_cast( + std::nearbyint(input_val * inv_scale) + zero_point))) - zero_point) * - scale; - }); + scale; + }); } void fake_quantize_grad_tensor_kernel_cuda( @@ -63,11 +62,10 @@ void fake_quantize_grad_tensor_kernel_cuda( .add_input(output_grad) .add_input(input) .build(); - gpu_kernel(iter, - [=] GPU_LAMBDA (float dy, float x) -> float { - int64_t Xq = std::nearbyint(x * inv_scale + zero_point); - return (Xq >= quant_min && Xq <= quant_max) * dy; - }); + gpu_kernel(iter, [=] GPU_LAMBDA(float dy, float x) -> float { + int64_t Xq = std::nearbyint(x * inv_scale) + zero_point; + return (Xq >= quant_min && Xq <= quant_max) * dy; + }); } void _fake_quantize_grad_learnable_tensor_kernel_cuda( @@ -82,7 +80,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( gpu_kernel_multiple_outputs( iter, [=] GPU_LAMBDA (float XInput, float dYInput) -> thrust::tuple { float dXOutput, dZeroPointOutput, dScaleOutput; - int64_t xq = std::nearbyint(zero_point + XInput * inv_scale); + int64_t xq = std::nearbyint(XInput * inv_scale) + zero_point; dXOutput = dYInput * (xq >= quant_min && xq <= quant_max); xq = std::max(std::min(xq, quant_max), quant_min); float xfq = static_cast((xq - zero_point) * scale); @@ -108,12 +106,13 @@ void fake_quant_per_channel_cuda(TensorIterator &iter, int64_t quant_min, int64_ [=] GPU_LAMBDA (float input_val, float scale, int64_t zero_point) -> float { float inv_scale = 1.0f / scale; return (fminf( - quant_max, - fmaxf( - quant_min, - static_cast(std::nearbyint( - input_val * inv_scale + zero_point)))) - - zero_point) * + quant_max, + fmaxf( + quant_min, + static_cast( + std::nearbyint(input_val * inv_scale) + + zero_point))) - + zero_point) * scale; }); } @@ -122,7 +121,7 @@ void fake_quant_grad_per_channel_cuda(TensorIterator &iter, int64_t quant_min, i gpu_kernel(iter, [=] GPU_LAMBDA (float x, float dy, float scale, int64_t zero_point) -> float { float inv_scale = 1.0f / scale; - int64_t Xq = std::nearbyint(x * inv_scale + zero_point); + int64_t Xq = std::nearbyint(x * inv_scale) + zero_point; return (Xq >= quant_min && Xq <= quant_max) * dy; }); } From 3f4b98d568e84967f7f1ff2ed963de6f58f14d5d Mon Sep 17 00:00:00 2001 From: Kshiteej K Date: Wed, 23 Dec 2020 14:20:08 -0800 Subject: [PATCH 36/45] [numpy] `torch.erfinv`: promote integer inputs to float (#49155) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49155 Reviewed By: ngimel Differential Revision: D25664234 Pulled By: mruberry fbshipit-source-id: 630fd1d334567d78c8130236a67dda0f5ec02560 --- aten/src/ATen/native/UnaryOps.cpp | 5 +++- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 2 +- aten/src/ATen/native/native_functions.yaml | 6 ++--- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 20 ---------------- .../_internal/common_methods_invocations.py | 23 +++++++++++++++++-- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 7ee381dc4374..4eb1f393e47c 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -276,6 +276,10 @@ Tensor& erfc_out(Tensor& result, const Tensor& self) { return unary_op_impl_floa Tensor erfc(const Tensor& self) { return unary_op_impl_float(self, erfc_stub); } Tensor& 
erfc_(Tensor& self) { return unary_op_impl_(self, at::erfc_out); } +Tensor& erfinv_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, erfinv_stub); } +Tensor erfinv(const Tensor& self) { return unary_op_impl_float(self, erfinv_stub); } +Tensor& erfinv_(Tensor& self) { return unary_op_impl_(self, at::erfinv_out); } + Tensor& frac_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, frac_stub); } Tensor frac(const Tensor& self) { return unary_op_impl(self, at::frac_out); } Tensor& frac_(Tensor& self) { return unary_op_impl_(self, at::frac_out); } @@ -683,7 +687,6 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cpu, CPU) \ IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cuda, CUDA) -IMPLEMENT_UNARY_OP_VEC_CUDA(erfinv) IMPLEMENT_UNARY_OP_VEC_CUDA(lgamma) DEFINE_DISPATCH(abs_stub); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 059da1f49f75..f5e1a4e85a04 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -160,7 +160,7 @@ void erfc_kernel_cuda(TensorIterator& iter) { } void erfinv_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "erfinv_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "erfinv_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::erfinv(a); }); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5efbc2f23080..42edbfe12748 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6937,14 +6937,12 @@ use_c10_dispatcher: full variants: method dispatch: - CPU: _erfinv__cpu - CUDA: _erfinv__cuda + CPU, CUDA: erfinv_ - func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: - CPU: _erfinv_out_cpu - CUDA: _erfinv_out_cuda + CPU, CUDA: erfinv_out - func: i0(Tensor self) -> Tensor use_c10_dispatcher: full diff --git a/test/test_torch.py b/test/test_torch.py index 294aa3bfa920..04fadcb65c66 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6913,7 +6913,6 @@ def inner(self, device, dtype): ('atanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('erf', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('erfc', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), - ('erfinv', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('exp', '', _small_3d, lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('exp', 'small', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index c96788f113eb..37ef90514803 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -766,26 +766,6 @@ def test_ceil_out_mismatch(self, device): b = torch.randn(1, device=device) self.assertRaises(RuntimeError, lambda: torch.ceil(a, out=b)) - # TODO: review with erfinv opinfo - @dtypesIfCUDA(torch.half, torch.float, torch.double) - @dtypes(torch.float, torch.double) - def test_erfinv(self, device, dtype): - # general testing. 
Narrow the range to avoid accuracy issues - input_values = torch.randn(4, 4, dtype=dtype, device=device).clamp(-0.3, 0.3) - self.assertEqual(input_values.erf().erfinv(), input_values) - # test inf - self.assertTrue(torch.equal(torch.tensor([-1, 1], dtype=dtype, device=device).erfinv(), - torch.tensor([-inf, inf], dtype=dtype, device=device))) - # test nan - self.assertEqual(torch.tensor([-2, 2], dtype=dtype, device=device).erfinv(), - torch.tensor([nan, nan], dtype=dtype, device=device)) - - if dtype == torch.double: - # double precision - a = torch.tensor([0.5, 0.8], dtype=torch.double, device=device).erfinv() - self.assertEqual(a[0].item(), 0.47693627620447, atol=1e-13, rtol=0) - self.assertEqual(a[1].item(), 0.90619380243682, atol=1e-13, rtol=0) - # TODO: opinfo hardshrink @onlyCPU @dtypes(torch.float, torch.double) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 80cd0d41f707..5ad0f55bc7af 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -25,6 +25,8 @@ TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY, torch_to_numpy_dtype_dict, TEST_WITH_SLOW) +from distutils.version import LooseVersion + if TEST_SCIPY: import scipy.special @@ -1139,6 +1141,25 @@ def reference_sigmoid(x): dtypes=[torch.bfloat16]),), assert_autodiffed=True, promotes_integers_to_float=True), + UnaryUfuncInfo('erfinv', + ref=scipy.special.erfinv, + decorators=(precisionOverride({torch.float16: 1e-2, + torch.bfloat16: 1e-2, + torch.float32: 1e-4}),), + dtypes=all_types_and(torch.bool), + dtypesIfCPU=all_types_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + promotes_integers_to_float=True, + domain=(-1, 1), + skips=( + # Reference: https://github.com/pytorch/pytorch/pull/49155#issuecomment-742664611 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + active_if=LooseVersion(scipy.__version__) < "1.4.0"), + # RuntimeError: "pow" not implemented for 'BFloat16' + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=[torch.bfloat16]), + ) + ), OpInfo('xlogy', dtypes=all_types_and(torch.bool), dtypesIfCPU=all_types_and(torch.bool, torch.half, torch.bfloat16), @@ -1412,8 +1433,6 @@ def method_tests(): ('expand_as', (S, 1, 1), (torch.rand(S, S, S),), '', (False,)), ('exp', (S, S, S), NO_ARGS, '', (True,)), ('exp', (), NO_ARGS, 'scalar', (True,)), - ('erfinv', torch.rand(S, S, S).clamp(-0.9, 0.9), NO_ARGS), - ('erfinv', normal_scalar_clamp(-0.9, 0.9, requires_grad=True), NO_ARGS, 'scalar'), ('logit', torch.randn(S, S, S).clamp(0.1, 0.9).requires_grad_(True), NO_ARGS, ''), ('logit', torch.randn(S, S, S).clamp(0.1, 0.9).requires_grad_(True), (0.2,), 'eps'), ('logit', uniform_scalar().clamp(0.1, 0.9).requires_grad_(True), NO_ARGS, 'scalar'), From e6a215592ea5b7f7f7e59e89116b507089bfb8d0 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Wed, 23 Dec 2020 14:21:59 -0800 Subject: [PATCH 37/45] [reland] Early terminate when CUDA assert were thrown (#49799) Summary: this is a reland of https://github.com/pytorch/pytorch/issues/49527. fixed slow test not running properly in py36 because capture_output is introduced in py37. 
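The ``capture_output`` point is purely a portability one: ``subprocess.run(..., capture_output=True)`` exists only on Python 3.7+, whereas requesting ``stderr`` explicitly works on 3.6 as well and is all this test needs. A minimal sketch of that pattern (the inline child script here is just a placeholder)::

    import subprocess
    import sys

    p = subprocess.run(
        [sys.executable, '-c', 'import sys; print("boom", file=sys.stderr)'],
        stderr=subprocess.PIPE,  # py3.6-compatible alternative to capture_output=True
        timeout=120,
    )
    print(p.stderr.decode('ascii'))  # -> boom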
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49799 Reviewed By: janeyx99 Differential Revision: D25692616 Pulled By: walterddr fbshipit-source-id: 9c5352220d632ec8d7464e5f162ffb468a0f30df --- test/test_testing.py | 52 ++++++++++++++++++- torch/testing/_internal/common_device_type.py | 14 +++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index b87345186cb3..d3bde3289f5e 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -3,9 +3,9 @@ import math from torch.testing._internal.common_utils import \ - (TestCase, run_tests, make_tensor) + (TestCase, make_tensor, run_tests, slowTest) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes) + (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -438,6 +438,54 @@ def test_assert_messages(self, device): self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, debug_msg="no_user_msg")) self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) + @onlyCUDA + @slowTest + def test_cuda_assert_should_stop_test_suite(self, device): + # This test is slow because it spawn another process to run another test suite. + import subprocess + import sys + + problematic_test_script = """\ +#!/usr/bin/env python + +import torch + +from torch.testing._internal.common_utils import (TestCase, run_tests) +from torch.testing._internal.common_device_type import instantiate_device_type_tests + +# This test is added to ensure that test suite terminates early when +# CUDA assert was thrown since all subsequent test will fail. +# See: https://github.com/pytorch/pytorch/issues/49019 +# This test file should be invoked from test_testing.py +class TestThatContainsCUDAAssertFailure(TestCase): + + def test_throw_unrecoverable_cuda_exception(self, device): + x = torch.rand(10, device=device) + # cause unrecoverable CUDA exception, recoverable on CPU + y = x[torch.tensor([25])].cpu() + + def test_trivial_passing_test_case_on_cpu_cuda(self, device): + x1 = torch.tensor([0., 1.], device=device) + x2 = torch.tensor([0., 1.], device='cpu') + self.assertEqual(x1, x2) + +instantiate_device_type_tests( + TestThatContainsCUDAAssertFailure, + globals(), + except_for=None +) + +if __name__ == '__main__': + run_tests() +""" + + # Test running of cuda assert test suite should early terminate. + p = subprocess.run([sys.executable, '-c', problematic_test_script], stderr=subprocess.PIPE, timeout=120) + # should capture CUDA error + self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) + # should run only 3 tests - 2 CPUs and 1 CUDA (remaining CUDA test should skip) + self.assertIn('Ran 3 tests', p.stderr.decode('ascii')) + instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 36f02eff0c0f..73185116a4f5 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -187,6 +187,9 @@ def _construct_test_name(test_name, op, device_type, dtype): class DeviceTypeTestBase(TestCase): device_type: str = 'generic_device_type' + # Flag to disable test suite early due to unrecoverable error such as CUDA error. 
+ _stop_test_suite = False + # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase._precision @@ -271,6 +274,11 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): self.precision = self._get_precision_override(test_fn, dtype) args = (arg for arg in (device_arg, dtype, op) if arg is not None) result = test_fn(self, *args) + except RuntimeError as rte: + if 'CUDA error: device-side assert triggered' in rte.__repr__(): + self._stop_test_suite = True + # raise the runtime error as is. + raise rte finally: self.precision = guard_precision @@ -313,6 +321,12 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): for dtype in dtypes: instantiate_test_helper(cls, name, test=test, dtype=dtype, op=None) + def run(self, result=None): + super().run(result=result) + # Early terminate test if _stop_test_suite is set. + if self._stop_test_suite: + result.stop() + class CPUTestBase(DeviceTypeTestBase): device_type = 'cpu' From 1833009202ee5b8e4e25affc325dec87cad87d36 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 23 Dec 2020 14:40:53 -0800 Subject: [PATCH 38/45] Fix typo in complex autograd docs (#49755) Summary: Update complex autograd docs to fix a typo Pull Request resolved: https://github.com/pytorch/pytorch/pull/49755 Reviewed By: mruberry Differential Revision: D25692649 Pulled By: soulitzer fbshipit-source-id: 43c2113b4c8f2d1828880102189a5a9b887dc784 --- docs/source/notes/autograd.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst index a932306f63e1..625ffa1ba238 100644 --- a/docs/source/notes/autograd.rst +++ b/docs/source/notes/autograd.rst @@ -302,8 +302,8 @@ From the above equations, we get: .. math:: \begin{aligned} - \frac{\partial }{\partial z} &= 1/2 * (\frac{\partial }{\partial x} - 1j * \frac{\partial z}{\partial y}) \\ - \frac{\partial }{\partial z^*} &= 1/2 * (\frac{\partial }{\partial x} + 1j * \frac{\partial z}{\partial y}) + \frac{\partial }{\partial z} &= 1/2 * (\frac{\partial }{\partial x} - 1j * \frac{\partial }{\partial y}) \\ + \frac{\partial }{\partial z^*} &= 1/2 * (\frac{\partial }{\partial x} + 1j * \frac{\partial }{\partial y}) \end{aligned} which is the classic definition of Wirtinger calculus that you would find on `Wikipedia `_. @@ -397,8 +397,8 @@ Solving the above equations for :math:`\frac{\partial L}{\partial u}` and :math: .. math:: \begin{aligned} - \frac{\partial L}{\partial u} = 1/2 * (\frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*}) \\ - \frac{\partial L}{\partial v} = -1/2j * (\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}) + \frac{\partial L}{\partial u} = \frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*} \\ + \frac{\partial L}{\partial v} = -1j * (\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}) \end{aligned} :label: [3] @@ -406,8 +406,8 @@ Substituting :eq:`[3]` in :eq:`[1]`, we get: .. 
math:: \begin{aligned} - \frac{\partial L}{\partial z^*} &= 1/2 * (\frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*}) * \frac{\partial u}{\partial z^*} - 1/2j * (\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}) * \frac{\partial v}{\partial z^*} \\ - &= \frac{\partial L}{\partial s} * 1/2 * (\frac{\partial u}{\partial z^*} + \frac{\partial v}{\partial z^*} j) + \frac{\partial L}{\partial s^*} * 1/2 * (\frac{\partial u}{\partial z^*} - \frac{\partial v}{\partial z^*} j) \\ + \frac{\partial L}{\partial z^*} &= (\frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*}) * \frac{\partial u}{\partial z^*} - 1j * (\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}) * \frac{\partial v}{\partial z^*} \\ + &= \frac{\partial L}{\partial s} * (\frac{\partial u}{\partial z^*} + \frac{\partial v}{\partial z^*} j) + \frac{\partial L}{\partial s^*} * (\frac{\partial u}{\partial z^*} - \frac{\partial v}{\partial z^*} j) \\ &= \frac{\partial L}{\partial s^*} * \frac{\partial (u + vj)}{\partial z^*} + \frac{\partial L}{\partial s} * \frac{\partial (u + vj)^*}{\partial z^*} \\ &= \frac{\partial L}{\partial s} * \frac{\partial s}{\partial z^*} + \frac{\partial L}{\partial s^*} * \frac{\partial s^*}{\partial z^*} \\ \end{aligned} From 5acc27c00a131c3e8ce4dad6e06222bb40f30dc7 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 23 Dec 2020 15:23:57 -0800 Subject: [PATCH 39/45] Revert D25690129: [pytorch][PR] Added linalg.inv Test Plan: revert-hammer Differential Revision: D25690129 (https://github.com/pytorch/pytorch/commit/8554b58fbdd865c760d92bfa50c1119cc8fc65e9) Original commit changeset: edb2d03721f2 fbshipit-source-id: 8679ea18e637423d35919544d2b047a62ac3abd8 --- aten/src/ATen/cuda/CUDABlas.cpp | 22 ++-- aten/src/ATen/cuda/CUDABlas.h | 2 +- aten/src/ATen/native/BatchLinearAlgebra.cpp | 110 +++-------------- .../ATen/native/cuda/BatchLinearAlgebra.cu | 109 +++++------------ .../ATen/native/cuda/BatchLinearAlgebraLib.cu | 85 ++++--------- .../ATen/native/cuda/BatchLinearAlgebraLib.h | 1 - aten/src/ATen/native/cuda/LinearAlgebra.cu | 4 +- aten/src/ATen/native/native_functions.yaml | 21 ---- docs/source/linalg.rst | 1 - test/test_linalg.py | 113 +++++------------- tools/autograd/derivatives.yaml | 3 - tools/autograd/gen_variable_type.py | 2 +- torch/csrc/api/include/torch/linalg.h | 19 --- torch/linalg/__init__.py | 59 --------- torch/overrides.py | 1 - .../_internal/common_methods_invocations.py | 33 ----- 16 files changed, 121 insertions(+), 464 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0521adf669c5..8c32c8db1a1c 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -831,18 +831,18 @@ void getrfBatched>( template <> void getriBatched( - int n, double** dA_array, int ldda, int* ipiv_array, double** dC_array, int lddc, int* info_array, int batchsize) { + int n, double** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize, double** dC_array) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasDgetriBatched( - handle, n, dA_array, ldda, ipiv_array, dC_array, lddc, info_array, batchsize)); + handle, n, dA_array, ldda, ipiv_array, dC_array, n, info_array, batchsize)); } template <> void getriBatched( - int n, float** dA_array, int ldda, int* ipiv_array, float** dC_array, int lddc, int* info_array, int batchsize) { + int n, float** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize, float** dC_array) { auto handle = 
at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasSgetriBatched( - handle, n, dA_array, ldda, ipiv_array, dC_array, lddc, info_array, batchsize)); + handle, n, dA_array, ldda, ipiv_array, dC_array, n, info_array, batchsize)); } template <> @@ -851,10 +851,9 @@ void getriBatched>( c10::complex** dA_array, int ldda, int* ipiv_array, - c10::complex** dC_array, - int lddc, int* info_array, - int batchsize) { + int batchsize, + c10::complex** dC_array) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasZgetriBatched( handle, @@ -863,7 +862,7 @@ void getriBatched>( ldda, ipiv_array, reinterpret_cast(dC_array), - lddc, + n, info_array, batchsize)); } @@ -874,10 +873,9 @@ void getriBatched>( c10::complex** dA_array, int ldda, int* ipiv_array, - c10::complex** dC_array, - int lddc, int* info_array, - int batchsize) { + int batchsize, + c10::complex** dC_array) { auto handle = at::cuda::getCurrentCUDABlasHandle(); TORCH_CUDABLAS_CHECK(cublasCgetriBatched( handle, @@ -886,7 +884,7 @@ void getriBatched>( ldda, ipiv_array, reinterpret_cast(dC_array), - lddc, + n, info_array, batchsize)); } diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index d44fc49c589a..93a0ff588dda 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -175,7 +175,7 @@ void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex void getriBatched(CUDABLAS_GETRI_ARGTYPES(Dtype)) { diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 7bbdd8072a11..cc1403febf90 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -528,15 +528,8 @@ Tensor linalg_solve(const Tensor& input, const Tensor& other) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -/* -Computes the inverse of n-by-n matrix 'self' -This is an in-place routine, content of 'self' is overriden. -'infos_lu' and 'infos_getri' are int Tensors containing error codes for each matrix in the batched input. -'infos_lu' is for holding lapackLU errors, and 'infos_getri' is for holding lapackGetri errors. -For more information see LAPACK's documentation for GETRI and GETRF routines. 
-*/ template -static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { +static void apply_inverse(Tensor& self, std::vector& infos) { #ifndef USE_LAPACK AT_ERROR("inverse: LAPACK library not found in compilation"); #else @@ -545,12 +538,9 @@ static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { auto self_matrix_stride = matrixStride(self); auto batch_size = batchCount(self); auto n = self.size(-2); - auto lda = std::max(1, n); - auto ipiv = at::empty({lda}, self.options().dtype(kInt)); + auto ipiv = at::empty({n}, self.options().dtype(kInt)); auto ipiv_data = ipiv.data_ptr(); - auto infos_lu_data = infos_lu.data_ptr(); - auto infos_getri_data = infos_getri.data_ptr(); int info; // Run once, first to get the optimum work size @@ -559,36 +549,39 @@ static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackGetri(n, self_data, lda, ipiv_data, &wkopt, lwork, &info); + lapackGetri(n, self_data, n, ipiv_data, &wkopt, lwork, &info); lwork = static_cast(real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); auto work_data = work.data_ptr(); for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - int* info_lu_working_ptr = &infos_lu_data[i]; - lapackLu(n, n, self_working_ptr, lda, ipiv_data, info_lu_working_ptr); + lapackLu(n, n, self_working_ptr, n, ipiv_data, &info); + infos[i] = info; + if (info != 0) { + return; + } // now compute the actual inverse - int* info_getri_working_ptr = &infos_getri_data[i]; - lapackGetri(n, self_working_ptr, lda, ipiv_data, work_data, lwork, info_getri_working_ptr); + lapackGetri(n, self_working_ptr, n, ipiv_data, work_data, lwork, &info); + infos[i] = info; + if (info != 0) { + return; + } } #endif } Tensor _inverse_helper_cpu(const Tensor& self) { - auto infos_lu = at::empty({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - auto infos_getri = at::empty({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + std::vector infos(batchCount(self), 0); auto self_working_copy = cloneBatchedColumnMajor(self); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cpu", [&]{ - apply_inverse(self_working_copy, infos_lu, infos_getri); + apply_inverse(self_working_copy, infos); }); if (self.dim() > 2) { - batchCheckErrors(infos_lu, "inverse_cpu"); - batchCheckErrors(infos_getri, "inverse_cpu"); + batchCheckErrors(infos, "inverse_cpu"); } else { - singleCheckErrors(infos_lu.item().toInt(), "inverse_cpu"); - singleCheckErrors(infos_getri.item().toInt(), "inverse_cpu"); + singleCheckErrors(infos[0], "inverse_cpu"); } return self_working_copy; } @@ -609,75 +602,6 @@ Tensor& inverse_out(Tensor &result, const Tensor &self) { return result; } -// This is a type dispatching helper function for 'apply_inverse' -Tensor& _linalg_inv_out_helper_cpu(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) { - // This function calculates the inverse matrix in-place - // result should be in column major order and contain matrices to invert - // the content of result is overriden by 'apply_inverse' - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cpu", [&]{ - apply_inverse(result, infos_lu, infos_getri); - }); - return result; -} - -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place -// LAPACK/MAGMA/cuSOLVER error codes are saved in 'infos' 
tensors, they are not checked here -static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& infos_getri, const Tensor& input) { - squareCheckInputs(input); - TORCH_INTERNAL_ASSERT(infos_lu.scalar_type() == kInt); - TORCH_INTERNAL_ASSERT(infos_getri.scalar_type() == kInt); - TORCH_CHECK(result.scalar_type() == input.scalar_type(), - "result dtype ", result.scalar_type(), " does not match input dtype ", input.scalar_type()); - TORCH_CHECK(result.device() == input.device(), - "result device ", result.device(), " does not match input device ", input.device()); - - // if result has no elements we can modify it - if (result.numel() == 0) { - at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous); - result.transpose_(-2, -1); - } else { - // Resize messes up the strides and we expect strictly column major order, so let's not use at::native::resize_output - TORCH_CHECK(result.sizes().equals(input.sizes()), - "result shape ", result.sizes(), " does not match input shape ", input.sizes()); - } - - TORCH_CHECK(result.transpose(-2, -1).is_contiguous(), "result tensor must be in batched column major order (Fortran contiguous)."); - result.copy_(input); - - at::native::resize_output(infos_lu, {std::max(1, batchCount(input))}); - at::native::resize_output(infos_getri, {std::max(1, batchCount(input))}); - infos_lu.fill_(0); - infos_getri.fill_(0); - - result = at::_linalg_inv_out_helper_(result, infos_lu, infos_getri); - return result; -} - -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place -Tensor& linalg_inv_out(Tensor &result, const Tensor &input) { - auto infos_lu = at::empty({0}, input.options().dtype(kInt)); - auto infos_getri = at::empty({0}, input.options().dtype(kInt)); - result = linalg_inv_out_info(result, infos_lu, infos_getri, input); - - // Now check LAPACK/MAGMA/cuSOLVER error codes - if (result.dim() > 2) { - batchCheckErrors(infos_lu, "linalg_inv_lu"); - batchCheckErrors(infos_getri, "linalg_inv_getri"); - } else { - singleCheckErrors(infos_lu.item().toInt(), "linalg_inv_lu"); - singleCheckErrors(infos_getri.item().toInt(), "linalg_inv_getri"); - } - - return result; -} - -// Computes the inverse matrix of 'input' -Tensor linalg_inv(const Tensor &input) { - Tensor result = at::empty({0}, input.options()); - result = at::linalg_inv_out(result, input); - return result; -} - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 252ab57048be..e5804ba389c5 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1227,14 +1227,8 @@ Tensor& _linalg_solve_out_helper_cuda(Tensor& result, Tensor& input, Tensor& inf // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -/* -Computes the inverse of n-by-n matrix 'self', it is saved to 'self_inv'. -'infos' is an int Tensor containing error codes for each matrix in the batched input. -'infos_lu' is for holding magmaLU errors, and 'infos_getri' is for holding magmaGetri errors -For more information see MAGMA's documentation for GETRI and GETRF routines. -*/ template -static void apply_batched_inverse(Tensor& self, Tensor& self_inv, Tensor& infos_lu, Tensor& infos_getri) { +static void apply_batched_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); @@ -1244,24 +1238,20 @@ AT_ERROR("inverse: MAGMA library not found in " auto self_inv_data = self_inv.data_ptr(); auto self_inv_mat_stride = matrixStride(self_inv); - auto infos_lu_data = infos_lu.data_ptr(); - auto infos_getri_data = infos_getri.data_ptr(); - magma_int_t batch_size = magma_int_cast(batchCount(self), "batchCount"); magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); - magma_int_t lda = std::max(1, n); + magma_int_t* info_array; magma_int_t* ipiv_data; magma_int_t** ipiv_array; scalar_t** self_array; scalar_t** self_inv_array; - magma_int_t batch_size_or_one = std::max(1, batch_size); - - ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size_or_one * lda); - ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size_or_one); - ALLOCATE_ARRAY(self_array, scalar_t*, batch_size_or_one); - ALLOCATE_ARRAY(self_inv_array, scalar_t*, batch_size_or_one); + ALLOCATE_ARRAY(info_array, magma_int_t, batch_size); + ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n); + ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size); + ALLOCATE_ARRAY(self_array, scalar_t*, batch_size); + ALLOCATE_ARRAY(self_inv_array, scalar_t*, batch_size); // Set up the created arrays for (int64_t i = 0; i < batch_size; i++) { @@ -1272,7 +1262,7 @@ AT_ERROR("inverse: MAGMA library not found in " MAGMAQueue magma_queue(self.get_device()); magmaLuBatched( - n, n, self_array, lda, ipiv_array, infos_lu_data, + n, n, self_array, n, ipiv_array, info_array, batch_size, magma_queue); constexpr int64_t batch_limit = 65535; @@ -1284,67 +1274,67 @@ AT_ERROR("inverse: MAGMA library not found in " scalar_t** self_array_cur = &self_array[mini_idx]; scalar_t** self_inv_array_cur = &self_inv_array[mini_idx]; magma_int_t** ipiv_array_cur = &ipiv_array[mini_idx]; - magma_int_t* info_array_cur_getri = &infos_getri_data[mini_idx]; + magma_int_t* info_array_cur = &info_array[mini_idx]; magmaGetriBatched( - n, self_array_cur, lda, ipiv_array_cur, self_inv_array_cur, - lda, info_array_cur_getri, batch_limit, magma_queue); + n, self_array_cur, n, ipiv_array_cur, self_inv_array_cur, + n, info_array_cur, batch_limit, magma_queue); } // Compute whatever is left = batch_size - floor(batch_size / batch_limit) * batch_limit // which concisely is equal to batch_size % batch_limit if (batch_size % batch_limit != 0) { magmaGetriBatched( - n, &self_array[mini_idx], lda, &ipiv_array[mini_idx], &self_inv_array[mini_idx], - lda, &infos_getri_data[mini_idx], batch_size % batch_limit, magma_queue); + n, &self_array[mini_idx], n, &ipiv_array[mini_idx], &self_inv_array[mini_idx], + n, &info_array[mini_idx], batch_size % batch_limit, magma_queue); + } + + for (int64_t i = 0; i < batch_size; i++) { + infos[i] = info_array[i]; } #endif } template -static void apply_single_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { +static void apply_single_inverse(Tensor& self, int64_t& info) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); #else auto self_data = self.data_ptr(); magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); - magma_int_t lda = std::max(1, n); magma_int_t lwork = n * magmaGetriOptimalBlocksize(n); + magma_int_t info_tmp = 0; - // magmaLu and magmaGetri requires infos tensor to live on CPU - infos_lu = infos_lu.to(at::kCPU); - infos_getri = infos_getri.to(at::kCPU); - - Tensor ipiv = at::empty({lda}, at::kInt); + Tensor ipiv = at::empty({n}, at::kInt); Tensor dwork = at::empty({lwork}, self.options()); - magmaLu(n, n, self_data, lda, ipiv.data_ptr(), infos_lu.data_ptr()); + magmaLu(n, n, self_data, n, ipiv.data_ptr(), &info_tmp); + if (info_tmp != 0) { + info = info_tmp; + return; + } magmaGetri( - n, self_data, lda, ipiv.data_ptr(), dwork.data_ptr(), lwork, infos_getri.data_ptr()); + n, self_data, n, ipiv.data_ptr(), dwork.data_ptr(), lwork, &info_tmp); + info = info_tmp; #endif } Tensor _inverse_helper_cuda_legacy(const Tensor& self) { auto self_inv_working_copy = cloneBatchedColumnMajor(self); if (self.dim() > 2) { - auto infos_lu = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - auto infos_getri = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + std::vector infos(batchCount(self), 0); auto self_working_copy = cloneBatchedColumnMajor(self); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ apply_batched_inverse( - self_working_copy, self_inv_working_copy, infos_lu, infos_getri); + self_working_copy, self_inv_working_copy, infos); }); - batchCheckErrors(infos_lu, "inverse_cuda"); - batchCheckErrors(infos_getri, "inverse_cuda"); + batchCheckErrors(infos, "inverse_cuda"); } else { - // magmaLu and magmaGetri requires infos tensor to live on CPU - auto infos_lu = at::zeros({1}, self.options().dtype(kInt).device(kCPU)); - auto infos_getri = at::zeros({1}, self.options().dtype(kInt).device(kCPU)); + int64_t info = 0; AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_single_inverse(self_inv_working_copy, infos_lu, infos_getri); + apply_single_inverse(self_inv_working_copy, info); }); - singleCheckErrors(infos_lu.item().toInt(), "inverse_cuda"); - singleCheckErrors(infos_getri.item().toInt(), "inverse_cuda"); + singleCheckErrors(info, "inverse_cuda"); } return self_inv_working_copy; } @@ -1361,39 +1351,6 @@ Tensor _inverse_helper_cuda(const Tensor& self) { #endif } -// This is a type dispatching helper function for 'apply_batched_inverse' and 'singleCheckErrors' -Tensor& _linalg_inv_out_helper_cuda_legacy(Tensor& result, Tensor& infos_lu, Tensor& infos_getri) { - // assuming result is in column major order and contains the matrices to invert - if (result.dim() > 2) { - auto input_working_copy = cloneBatchedColumnMajor(result); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ - apply_batched_inverse( - input_working_copy, result, infos_lu, infos_getri); - }); - } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ - apply_single_inverse(result, infos_lu, infos_getri); - }); - } - return result; -} - -// This is a MAGMA/cuSOLVER dispatching helper function -Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) { - // This function calculates the inverse matrix in-place - // result should be in column major order and contain matrices to invert -#ifdef USE_CUSOLVER - if ((result.dim() == 2) || (/* result.dim() > 2 && */ 
batchCount(result) <= 2) || !use_magma_) { - return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri); // cusolver or cublas - } else { - return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda - } -#else - return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda -#endif - return result; -} - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu index 534f257d55bb..37c360357e82 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu @@ -26,31 +26,28 @@ inline static Tensor column_major_identity_matrix_like(const Tensor& self) { } template -inline static void _apply_single_inverse_helper(scalar_t* self_ptr, scalar_t* self_inv_ptr, int* ipiv_ptr, int* info_getrf_ptr, int* info_getrs_ptr, int n, int lda) { +inline static void _apply_single_inverse_helper(scalar_t* self_ptr, scalar_t* self_inv_ptr, int* ipiv_ptr, int* info_ptr, int n) { // self_inv_ptr should already be an identity matrix auto handle = at::cuda::getCurrentCUDASolverDnHandle(); - at::cuda::solver::getrf(handle, n, n, self_ptr, lda, ipiv_ptr, info_getrf_ptr); - at::cuda::solver::getrs(handle, n, n, self_ptr, lda, ipiv_ptr, self_inv_ptr, lda, info_getrs_ptr); + at::cuda::solver::getrf(handle, n, n, self_ptr, n, ipiv_ptr, info_ptr); + at::cuda::solver::getrs(handle, n, n, self_ptr, n, ipiv_ptr, self_inv_ptr, n, info_ptr + 1); } template -static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& infos_getrf, Tensor& infos_getrs) { +static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& infos) { const int batch_size = cuda_int_cast(batchCount(self), "batchCount"); const int n = cuda_int_cast(self.size(-2), "self.size(-2)"); - const int lda = std::max(1, n); auto self_data = self.data_ptr(); auto self_mat_stride = matrixStride(self); auto self_inv_data = self_inv.data_ptr(); auto self_inv_mat_stride = matrixStride(self_inv); - auto infos_getrf_data = infos_getrf.data_ptr(); - auto infos_getrs_data = infos_getrs.data_ptr(); - auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); if (use_loop_launch(batch_size, n)) { + int* p_infos = infos.data_ptr(); auto main_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAEvent main_event; @@ -62,14 +59,10 @@ static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& in main_event.block(stream); - auto dataPtr = allocator.allocate(sizeof(int) * lda); + auto dataPtr = allocator.allocate(sizeof(int) * n); int* pivot = reinterpret_cast(dataPtr.get()); - - int* infos_getrf_working_ptr = &infos_getrf_data[i]; - int* infos_getrs_working_ptr = &infos_getrs_data[i]; - _apply_single_inverse_helper( - &self_data[i * self_mat_stride], &self_inv_data[i * self_inv_mat_stride], pivot, infos_getrf_working_ptr, infos_getrs_working_ptr, n, lda); + &self_data[i * self_mat_stride], &self_inv_data[i * self_inv_mat_stride], pivot, p_infos + i * 2, n); at::cuda::CUDAEvent finished; finished.record(stream); @@ -86,52 +79,30 @@ static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& in reinterpret_cast(&self_inv_data[(batch_size-1) * self_inv_mat_stride]) + 1, static_cast(self_inv_mat_stride * sizeof(scalar_t)), self.options().dtype(at::kLong)); - auto dataPtr = allocator.allocate(sizeof(int)*batch_size*lda); + auto dataPtr = 
allocator.allocate(sizeof(int)*batch_size*n); int* ipiv_array = reinterpret_cast(dataPtr.get()); - at::cuda::blas::getrfBatched(n, reinterpret_cast(self_array.data_ptr()), lda, - ipiv_array, infos_getrf_data, batch_size); + Tensor _info1 = at::zeros({batch_size}, self.options().dtype(at::kInt)); + Tensor _info2 = at::zeros({batch_size}, self.options().dtype(at::kInt)); + + at::cuda::blas::getrfBatched(n, reinterpret_cast(self_array.data_ptr()), n, + ipiv_array, _info1.data_ptr(), batch_size); - at::cuda::blas::getriBatched(n, reinterpret_cast(self_array.data_ptr()), lda, - ipiv_array, reinterpret_cast(self_inv_array.data_ptr()), lda, infos_getrs_data, batch_size); + at::cuda::blas::getriBatched(n, reinterpret_cast(self_array.data_ptr()), n, + ipiv_array, _info2.data_ptr(), batch_size, reinterpret_cast(self_inv_array.data_ptr())); + + infos = at::stack({_info1, _info2}, 1); } } template -static void apply_single_inverse_lib(const Tensor& self, Tensor& self_inv, Tensor& infos_getrf, Tensor& infos_getrs) { +static void apply_single_inverse_lib(const Tensor& self, Tensor& self_inv, Tensor& info) { int n = cuda_int_cast(self.size(-2), "self.size(-2)"); - int lda = std::max(1, n); - Tensor ipiv = at::empty({lda}, self.options().dtype(at::kInt)); + Tensor ipiv = at::empty({n}, self.options().dtype(at::kInt)); _apply_single_inverse_helper( - self.data_ptr(), self_inv.data_ptr(), ipiv.data_ptr(), infos_getrf.data_ptr(), infos_getrs.data_ptr(), n, lda); -} - -// This is a type dispatching helper function for 'apply_batched_inverse_lib' and 'apply_single_inverse_lib' -Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Tensor& infos_getrs) { - // assuming result is in column major order and contains the matrices to invert - Tensor input_working_copy = cloneBatchedColumnMajor(result); - - // for getrf + getrs (cusolver path) - // result should be filled with identity matrices - result.zero_(); - result.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); - - const int batch_size = cuda_int_cast(batchCount(result), "batchCount"); - - if (result.dim() > 2) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ - apply_batched_inverse_lib( - input_working_copy, result, infos_getrf, infos_getrs); - }); - } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ - apply_single_inverse_lib(input_working_copy, result, infos_getrf, infos_getrs); - }); - } - - return result; + self.data_ptr(), self_inv.data_ptr(), ipiv.data_ptr(), info.data_ptr(), n); } Tensor _inverse_helper_cuda_lib(const Tensor& self) { @@ -140,22 +111,18 @@ Tensor _inverse_helper_cuda_lib(const Tensor& self) { const int batch_size = cuda_int_cast(batchCount(self), "batchCount"); if (self.dim() > 2 && batch_size > 1) { - Tensor infos_getrf = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - Tensor infos_getrs = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); + Tensor infos = at::zeros({batchCount(self) * 2}, self.options().dtype(kInt)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ apply_batched_inverse_lib( - self_working_copy, self_inv_working_copy, infos_getrf, infos_getrs); + self_working_copy, self_inv_working_copy, infos); }); - batchCheckErrors(infos_getrf, "inverse_cuda"); - batchCheckErrors(infos_getrs, "inverse_cuda"); + batchCheckErrors(infos, "inverse_cuda", false, 2); } else { - Tensor infos_getrf = at::zeros({1}, 
self.options().dtype(kInt)); - Tensor infos_getrs = at::zeros({1}, self.options().dtype(kInt)); + Tensor info = at::zeros({2}, self.options().dtype(at::kInt)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_single_inverse_lib(self_working_copy, self_inv_working_copy, infos_getrf, infos_getrs); + apply_single_inverse_lib(self_working_copy, self_inv_working_copy, info); }); - batchCheckErrors(infos_getrf, "inverse_cuda"); - batchCheckErrors(infos_getrs, "inverse_cuda"); + batchCheckErrors(info, "inverse_cuda", false, 2); } return self_inv_working_copy; diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h index 2be18137a64f..dc6dc2f9daca 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h @@ -18,7 +18,6 @@ namespace at { namespace native { Tensor _inverse_helper_cuda_lib(const Tensor& self); -Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Tensor& infos_getrs); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index 69a366cc9cd5..88e4d2f9a8e3 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -47,9 +47,7 @@ Tensor prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_ten ld_tensor = tensor_strides[fast_dim]; } else { transpose_tensor = !transpose_result; - // gemm call requires leading dimension and stride parameters to be non-zero - bool is_stride_non_zero = tensor.stride(1) != 0 && tensor.stride(2) != 0; - if (tensor.is_contiguous() && is_stride_non_zero) { + if (tensor.is_contiguous()) { tensor_ = tensor; } else { tensor_ = tensor.clone(at::MemoryFormat::Contiguous); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 42edbfe12748..e6bb52490e7d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -10045,27 +10045,6 @@ dispatch: DefaultBackend: linalg_eigvalsh_out -- func: _linalg_inv_out_helper_(Tensor(a!) self, Tensor(b!) infos_lu, Tensor(c!) infos_getri) -> Tensor(a!) - use_c10_dispatcher: full - variants: function - dispatch: - CPU: _linalg_inv_out_helper_cpu - CUDA: _linalg_inv_out_helper_cuda - -- func: linalg_inv(Tensor self) -> Tensor - python_module: linalg - use_c10_dispatcher: full - variants: function - dispatch: - DefaultBackend: linalg_inv - -- func: linalg_inv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - python_module: linalg - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - variants: function - dispatch: - DefaultBackend: linalg_inv_out - - func: inner(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index 761d0e97a1e9..36f91627d522 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -22,4 +22,3 @@ Functions .. autofunction:: solve .. autofunction:: tensorinv .. autofunction:: tensorsolve -.. 
autofunction:: inv diff --git a/test/test_linalg.py b/test/test_linalg.py index 742bfc9e35be..8402be471a88 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1993,19 +1993,18 @@ def func(root, b, upper): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3, - torch.float64: 1e-8, torch.complex128: 1e-8}) + @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3}) def test_inverse(self, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value - def run_test(torch_inverse, matrix, batches, n): - matrix_inverse = torch_inverse(matrix) + def run_test(matrix, batches, n): + matrix_inverse = torch.inverse(matrix) # Compare against NumPy output # NumPy uses 'gesv' LAPACK routine solving the equation A A_inv = I # But in PyTorch 'gertf' + 'getri' is used causing element-wise differences expected = np.linalg.inv(matrix.cpu().numpy()) - self.assertEqual(matrix_inverse, expected, atol=self.precision, rtol=self.precision) + self.assertEqual(matrix_inverse, expected, atol=self.precision, rtol=1e-4) # Additional correctness tests, check matrix*matrix_inverse == identity identity = torch.eye(n, dtype=dtype, device=device) @@ -2013,49 +2012,44 @@ def run_test(torch_inverse, matrix, batches, n): self.assertEqual(identity.expand_as(matrix), torch.matmul(matrix_inverse, matrix)) # check the out= variant - # prepare the expected out tensor matrix_inverse_out = torch.empty(*batches, n, n, dtype=dtype, device=device) - matrix_inverse_out_t = matrix_inverse_out.transpose(-2, -1).clone(memory_format=torch.contiguous_format) - matrix_inverse_out = matrix_inverse_out_t.transpose(-2, -1) - ans = torch_inverse(matrix, out=matrix_inverse_out) + ans = torch.inverse(matrix, out=matrix_inverse_out) self.assertEqual(matrix_inverse_out, ans, atol=0, rtol=0) self.assertEqual(matrix_inverse_out, matrix_inverse, atol=0, rtol=0) # batched matrices: 3+ dimensional tensors, check matrix_inverse same as single-inverse for each matrix - if matrix.ndim > 2 and batches[0] != 0: + if matrix.ndim > 2: expected_inv_list = [] p = int(np.prod(batches)) # use `p` instead of -1, so that the test works for empty input as well for mat in matrix.contiguous().view(p, n, n): - expected_inv_list.append(torch_inverse(mat)) + expected_inv_list.append(torch.inverse(mat)) expected_inv = torch.stack(expected_inv_list).view(*batches, n, n) if self.device_type == 'cuda' and dtype in [torch.float32, torch.complex64]: # single-inverse is done using cuSOLVER, while batched inverse is done using MAGMA # individual values can be significantly different for fp32, hence rather high rtol is used - # the important thing is that torch_inverse passes above checks with identity + # the important thing is that torch.inverse passes above checks with identity self.assertEqual(matrix_inverse, expected_inv, atol=1e-1, rtol=1e-2) else: self.assertEqual(matrix_inverse, expected_inv) - for torch_inverse in [torch.inverse, torch.linalg.inv]: - for batches, n in itertools.product( - [[], [0], [1], [4], [2, 3]], - [0, 5, 64] - ): - # large batch size and large matrix size will be tested in test_inverse_many_batches (slow test) - if batches and batches[0] == 32 and n == 256: - continue - matrices = random_fullrank_matrix_distinct_singular_value(n, *batches, dtype=dtype).to(device) - run_test(torch_inverse, matrices, batches, n) - - # test non-contiguous input - 
run_test(torch_inverse, matrices.transpose(-2, -1), batches, n) - if n > 0: - run_test( - torch_inverse, - random_fullrank_matrix_distinct_singular_value(n * 2, *batches, dtype=dtype).to(device) - .view(-1, n * 2, n * 2)[:, ::2, ::2].view(*batches, n, n), - batches, n - ) + for batches, n in itertools.product( + [[], [1], [4], [2, 3]], + [0, 5, 64] + ): + # large batch size and large matrix size will be tested in test_inverse_many_batches (slow test) + if batches and batches[0] == 32 and n == 256: + continue + matrices = random_fullrank_matrix_distinct_singular_value(n, *batches, dtype=dtype).to(device) + run_test(matrices, batches, n) + + # test non-contiguous input + run_test(matrices.transpose(-2, -1), batches, n) + if n > 0: + run_test( + random_fullrank_matrix_distinct_singular_value(n * 2, *batches, dtype=dtype).to(device) + .view(-1, n * 2, n * 2)[:, ::2, ::2].view(*batches, n, n), + batches, n + ) @slowTest @skipCUDAIfNoMagmaAndNoCusolver @@ -2066,18 +2060,17 @@ def run_test(torch_inverse, matrix, batches, n): def test_inverse_many_batches(self, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value - def test_inverse_many_batches_helper(torch_inverse, b, n): + def test_inverse_many_batches_helper(b, n): matrices = random_fullrank_matrix_distinct_singular_value(b, n, n, dtype=dtype).to(device) - matrices_inverse = torch_inverse(matrices) + matrices_inverse = torch.inverse(matrices) # Compare against NumPy output expected = np.linalg.inv(matrices.cpu().numpy()) - self.assertEqual(matrices_inverse, expected, atol=self.precision, rtol=1e-3) + self.assertEqual(matrices_inverse, expected, atol=self.precision, rtol=1e-4) - for torch_inverse in [torch.inverse, torch.linalg.inv]: - test_inverse_many_batches_helper(torch_inverse, 5, 256) - test_inverse_many_batches_helper(torch_inverse, 3, 512) - test_inverse_many_batches_helper(torch_inverse, 64, 64) + test_inverse_many_batches_helper(5, 256) + test_inverse_many_batches_helper(3, 512) + test_inverse_many_batches_helper(64, 64) @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @@ -2098,48 +2091,6 @@ def run_test_singular_input(batch_dim, n): for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: run_test_singular_input(*params) - @skipCUDAIfNoMagmaAndNoCusolver - @skipCPUIfNoLapack - @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_inv_errors(self, device, dtype): - # inv expects batches of square matrices as input - a = torch.randn(2, 3, 4, 3, dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, "must be batches of square matrices"): - torch.linalg.inv(a) - - # inv requires the input to be at least 2 dimensional tensor - a = torch.randn(2, device=device, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, "must have at least 2 dimensions"): - torch.linalg.inv(a) - - # if input is not invertible, RuntimeError is raised mentioning the first non-invertible batch - def run_test_singular_input(batch_dim, n): - a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1) - a[n, -1, -1] = 0 - with self.assertRaisesRegex(RuntimeError, rf"For batch {n}: U\(3,3\) is zero"): - torch.linalg.inv(a) - - for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: - run_test_singular_input(*params) - - # if non-empty out tensor with wrong shape is passed an error is thrown - a = torch.randn(2, 3, 3, device=device, dtype=dtype) - out = torch.empty(1, device=device, dtype=dtype) - with 
self.assertRaisesRegex(RuntimeError, "does not match input shape"): - torch.linalg.inv(a, out=out) - - # dtypes should match - out = torch.empty_like(a).to(torch.int) - with self.assertRaisesRegex(RuntimeError, "result dtype Int does not match input dtype"): - torch.linalg.inv(a, out=out) - - # device should match - if torch.cuda.is_available(): - wrong_device = 'cpu' if self.device_type != 'cpu' else 'cuda' - out = torch.empty(0, device=wrong_device, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, "does not match input device"): - torch.linalg.inv(a, out=out) - def solve_test_helper(self, A_dims, b_dims, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 802eb3fed71a..9f68622e7691 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -596,9 +596,6 @@ - name: inverse(Tensor self) -> Tensor self: -at::matmul(result.conj().transpose(-2, -1), at::matmul(grad, result.conj().transpose(-2, -1))) -- name: linalg_inv(Tensor self) -> Tensor - self: -at::matmul(result.conj().transpose(-2, -1), at::matmul(grad, result.conj().transpose(-2, -1))) - - name: isnan(Tensor self) -> Tensor self: non_differentiable diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 1f1620a1c418..03fbf34034ea 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd', 'linalg_inv', + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd', '_fft_c2c', '_fft_r2c', 'linalg_solve', 'sqrt' } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index 22e3318d331a..f01755c45a74 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -84,14 +84,6 @@ inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& return torch::linalg_tensorsolve_out(result, self, other, dims); } -inline Tensor inv(const Tensor& input) { - return torch::linalg_inv(input); -} - -inline Tensor& inv_out(Tensor& result, const Tensor& input) { - return torch::linalg_inv_out(result, input); -} - } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -213,15 +205,4 @@ inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor return detail::tensorsolve_out(result, input, other, dims); } -/// Computes a tensor `inverse_input` such that `dot(input, inverse_input) = eye(input.size(0))`. 
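(Editorial aside, not part of the patch: the relation quoted in the doc comment above — the inverse composed with the input reproduces the identity — can be checked directly with the torch.inverse API that remains after this revert, much like the identity check kept in test_inverse. A minimal sketch; the diagonally dominant matrix and the tolerance are choices made here for a well-conditioned example, not values taken from the tests.)

import torch

# Build a diagonally dominant (hence comfortably invertible) matrix.
a = torch.rand(4, 4, dtype=torch.float64) + 4.0 * torch.eye(4, dtype=torch.float64)
a_inv = torch.inverse(a)

# dot(input, inverse_input) should reproduce the identity up to round-off.
identity = torch.eye(4, dtype=torch.float64)
assert torch.allclose(a @ a_inv, identity, atol=1e-10)
assert torch.allclose(a_inv @ a, identity, atol=1e-10)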
-/// -/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.inv -inline Tensor inv(const Tensor& input) { - return detail::inv(input); -} - -inline Tensor& inv_out(Tensor& result, const Tensor& input) { - return detail::inv_out(result, input); -} - }} // torch::linalg diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index f80aa8da045d..575bc92534be 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -82,65 +82,6 @@ True """) -inv = _add_docstr(_linalg.linalg_inv, r""" -linalg.inv(input, *, out=None) -> Tensor - -This function computes the "multiplicative inverse" matrix of a square matrix, or batch of such matrices, :attr:`input`. -The result satisfies the relation - -``matmul(inv(input), input) = matmul(input, inv(input)) = eye(input.shape[0]).expand_as(input)``. - -Supports input of float, double, cfloat and cdouble data types. - -.. note:: If :attr:`input` is a non-invertible matrix or non-square matrix, or batch with at least one such matrix, - then a RuntimeError will be thrown. - -.. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. - -Args: - input (Tensor): the square :math:`n \times n` matrix or the batch - of such matrices of size :math:`(*, n, n)` where `*` is one or more batch dimensions. - -Keyword args: - out (Tensor, optional): The output tensor. Ignored if None. Default: None - -Examples:: - - >>> x = torch.rand(4, 4) - >>> y = torch.linalg.inv(x) - >>> z = torch.mm(x, y) - >>> z - tensor([[ 1.0000, -0.0000, -0.0000, 0.0000], - [ 0.0000, 1.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 1.0000, 0.0000], - [ 0.0000, -0.0000, -0.0000, 1.0000]]) - >>> torch.max(torch.abs(z - torch.eye(4))) # Max non-zero - tensor(1.1921e-07) - - >>> # Batched inverse example - >>> x = torch.randn(2, 3, 4, 4) - >>> y = torch.linalg.inv(x) - >>> z = torch.matmul(x, y) - >>> torch.max(torch.abs(z - torch.eye(4).expand_as(x))) # Max non-zero - tensor(1.9073e-06) - - >>> x = torch.rand(4, 4, dtype=torch.cdouble) - >>> y = torch.linalg.inv(x) - >>> z = torch.mm(x, y) - >>> z - tensor([[ 1.0000e+00+0.0000e+00j, -1.3878e-16+3.4694e-16j, - 5.5511e-17-1.1102e-16j, 0.0000e+00-1.6653e-16j], - [ 5.5511e-16-1.6653e-16j, 1.0000e+00+6.9389e-17j, - 2.2204e-16-1.1102e-16j, -2.2204e-16+1.1102e-16j], - [ 3.8858e-16-1.2490e-16j, 2.7756e-17+3.4694e-17j, - 1.0000e+00+0.0000e+00j, -4.4409e-16+5.5511e-17j], - [ 4.4409e-16+5.5511e-16j, -3.8858e-16+1.8041e-16j, - 2.2204e-16+0.0000e+00j, 1.0000e+00-3.4694e-16j]], - dtype=torch.complex128) - >>> torch.max(torch.abs(z - torch.eye(4, dtype=torch.cdouble))) # Max non-zero - tensor(7.5107e-16, dtype=torch.float64) -""") - det = _add_docstr(_linalg.linalg_det, r""" linalg.det(input) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index e2396c806ccf..6c193b273344 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -465,7 +465,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]: cudnn_enabled: -1), torch.int_repr: lambda input: -1, torch.inverse: lambda input, out=None: -1, - torch.linalg.inv: lambda input, out=None: -1, torch.is_complex: lambda input: -1, torch.is_distributed: lambda input: -1, torch.is_floating_point: lambda input: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 5ad0f55bc7af..d9be13730598 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -311,31 +311,6 @@ def 
sample_inputs_xlogy(self, device, dtype, requires_grad): low=0, high=None, requires_grad=requires_grad))),) -def sample_inputs_linalg_inv(op_info, device, dtype, requires_grad=False): - """ - This function generates always invertible input for torch.linalg.inv using - random_fullrank_matrix_distinct_singular_value. - The input is generated as the itertools.product of 'batches' and 'ns'. - In total this function generates 8 SampleInputs - 'batches' cases include: - () - single input, - (0,) - zero batched dimension, - (2,) - batch of two matrices, - (2, 3) - 2x3 batch of matrices - 'ns' gives 0x0 and 5x5 matrices. - Zeros in dimensions are edge cases in the implementation and important to test for in order to avoid unexpected crashes. - """ - from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value - - batches = [(), (0, ), (2, ), (2, 3)] - ns = [0, 5] - out = [] - for batch, n in product(batches, ns): - a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype).to(device) - a.requires_grad = requires_grad - out.append(SampleInput(a)) - return out - def np_sinc_with_fp16_as_fp32(x): # Wraps numpy's sinc function so that fp16 values are promoted to fp32 # before sinc is invoked. Context: numpy's sinc returns NaN when evaluated @@ -1040,14 +1015,6 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): dtypes=[torch.bfloat16])), promotes_integers_to_float=True, handles_complex_extremals=False), - OpInfo('linalg.inv', - aten_name='linalg_inv', - op=torch.linalg.inv, - dtypes=floating_and_complex_types(), - test_inplace_grad=False, - supports_tensor_out=True, - sample_inputs_func=sample_inputs_linalg_inv, - decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), UnaryUfuncInfo('angle', ref=np.angle, dtypes=all_types_and_complex_and(torch.bool), From 9552cc65d450bc43851df7f45f06190b3b419968 Mon Sep 17 00:00:00 2001 From: Himangshu Date: Wed, 23 Dec 2020 15:40:00 -0800 Subject: [PATCH 40/45] Creation of test framework for Sparse Operators (#48488) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48488 Reviewed By: ngimel Differential Revision: D25696487 Pulled By: mruberry fbshipit-source-id: dc4f57c6628f62b74dd321f3f6b0fff86f25b040 --- test/test_sparse.py | 31 +++++++++++++++++++ .../_internal/common_methods_invocations.py | 8 ++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/test/test_sparse.py b/test/test_sparse.py index 80d54b3caba0..6daf3f1931d2 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -16,6 +16,10 @@ from numbers import Number from torch.autograd.gradcheck import gradcheck from typing import Dict, Any +from torch.testing._internal.common_device_type import \ + (instantiate_device_type_tests, ops) +from torch.testing._internal.common_methods_invocations import \ + (sparse_unary_ufuncs) if TEST_SCIPY: import scipy.sparse @@ -3233,6 +3237,33 @@ def test_cuda_sparse_cpu_dense_add(self): with self.assertRaisesRegex(RuntimeError, "add: expected 'self' to be a CUDA tensor, but got a CPU tensor"): x + sparse_y +class TestSparseUnaryUfuncs(TestCase): + exact_dtype = True + + @ops(sparse_unary_ufuncs) + def test_sparse_consistency(self, device, dtype, op): + unsupportedTypes = [torch.bfloat16, torch.cfloat, torch.cdouble] + if dtype in unsupportedTypes: + self.skipTest('Skipped! Unsupported dtypes for Sparse') + + samples = op.sample_inputs(device, dtype) + + if len(samples) == 0: + self.skipTest("Skipped! 
No sample inputs!") + + sample = samples[0] + + if len(sample.input) > 1: + self.skipTest("Skipped! Testing unary ops, one input is expected") + sample = sample.input[0] + + expected = op(sample) + assert torch.is_tensor(expected) + output = op(sample.to_sparse()) + assert torch.is_tensor(output) + self.assertEqual(output.to_dense(), expected) + +instantiate_device_type_tests(TestSparseUnaryUfuncs, globals()) if __name__ == '__main__': run_tests() diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d9be13730598..808506dc6809 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -108,6 +108,7 @@ def __init__(self, promotes_integers_to_float=False, # whether op promotes unary output to float or not sample_inputs_func=None, # function to generate sample inputs aten_name=None, # name of the corresponding aten:: operator + supports_sparse=False # supported for sparse ): # Validates the dtypes are generated from the dispatch-related functions @@ -146,6 +147,7 @@ def __init__(self, self.autodiff_nonfusible_nodes = ['aten::' + self.name] else: self.autodiff_nonfusible_nodes = autodiff_nonfusible_nodes + self.supports_sparse = supports_sparse @@ -257,6 +259,7 @@ def __init__(self, handles_complex_extremals=True, # whether the op correct handles complex extremals (like inf -infj) supports_complex_to_float=False, # op supports casting from complex input to real output safely eg. angle sample_inputs_func=sample_inputs_unary, + supports_sparse=False, **kwargs): super(UnaryUfuncInfo, self).__init__(name, dtypes=dtypes, @@ -264,6 +267,7 @@ def __init__(self, dtypesIfCUDA=dtypesIfCUDA, dtypesIfROCM=dtypesIfROCM, sample_inputs_func=sample_inputs_func, + supports_sparse=supports_sparse, **kwargs) self.ref = ref self.domain = domain @@ -613,6 +617,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): UnaryUfuncInfo('asin', ref=np.arcsin, domain=(-1, 1), + supports_sparse=True, decorators=(precisionOverride({torch.bfloat16: 1e-2}),), promotes_integers_to_float=True, dtypes=all_types_and_complex_and(torch.bool), @@ -625,7 +630,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cuda', dtypes=[torch.cfloat, torch.cdouble], - active_if=IS_WINDOWS), + active_if=IS_WINDOWS) )), # NOTE: derivative for inplace asinh is not implemented UnaryUfuncInfo('asinh', @@ -1141,6 +1146,7 @@ def reference_sigmoid(x): # Common operator groupings unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo)] spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] +sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] def index_variable(shape, max_indices): if not isinstance(shape, tuple): From 69b1373587f57cd79921a16c849c13f2c8bd3f39 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Wed, 23 Dec 2020 17:46:03 -0800 Subject: [PATCH 41/45] Revert D25692616: [pytorch][PR] [reland] Early terminate when CUDA assert were thrown Test Plan: revert-hammer Differential Revision: D25692616 (https://github.com/pytorch/pytorch/commit/e6a215592ea5b7f7f7e59e89116b507089bfb8d0) Original commit changeset: 9c5352220d63 fbshipit-source-id: dade8068cad265d15ee908d98abe0de5b81a195d --- test/test_testing.py | 52 +------------------ 
torch/testing/_internal/common_device_type.py | 14 ----- 2 files changed, 2 insertions(+), 64 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index d3bde3289f5e..b87345186cb3 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -3,9 +3,9 @@ import math from torch.testing._internal.common_utils import \ - (TestCase, make_tensor, run_tests, slowTest) + (TestCase, run_tests, make_tensor) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) + (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes) # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -438,54 +438,6 @@ def test_assert_messages(self, device): self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, debug_msg="no_user_msg")) self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) - @onlyCUDA - @slowTest - def test_cuda_assert_should_stop_test_suite(self, device): - # This test is slow because it spawn another process to run another test suite. - import subprocess - import sys - - problematic_test_script = """\ -#!/usr/bin/env python - -import torch - -from torch.testing._internal.common_utils import (TestCase, run_tests) -from torch.testing._internal.common_device_type import instantiate_device_type_tests - -# This test is added to ensure that test suite terminates early when -# CUDA assert was thrown since all subsequent test will fail. -# See: https://github.com/pytorch/pytorch/issues/49019 -# This test file should be invoked from test_testing.py -class TestThatContainsCUDAAssertFailure(TestCase): - - def test_throw_unrecoverable_cuda_exception(self, device): - x = torch.rand(10, device=device) - # cause unrecoverable CUDA exception, recoverable on CPU - y = x[torch.tensor([25])].cpu() - - def test_trivial_passing_test_case_on_cpu_cuda(self, device): - x1 = torch.tensor([0., 1.], device=device) - x2 = torch.tensor([0., 1.], device='cpu') - self.assertEqual(x1, x2) - -instantiate_device_type_tests( - TestThatContainsCUDAAssertFailure, - globals(), - except_for=None -) - -if __name__ == '__main__': - run_tests() -""" - - # Test running of cuda assert test suite should early terminate. - p = subprocess.run([sys.executable, '-c', problematic_test_script], stderr=subprocess.PIPE, timeout=120) - # should capture CUDA error - self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) - # should run only 3 tests - 2 CPUs and 1 CUDA (remaining CUDA test should skip) - self.assertIn('Ran 3 tests', p.stderr.decode('ascii')) - instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 73185116a4f5..36f02eff0c0f 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -187,9 +187,6 @@ def _construct_test_name(test_name, op, device_type, dtype): class DeviceTypeTestBase(TestCase): device_type: str = 'generic_device_type' - # Flag to disable test suite early due to unrecoverable error such as CUDA error. 
- _stop_test_suite = False - # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase._precision @@ -274,11 +271,6 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): self.precision = self._get_precision_override(test_fn, dtype) args = (arg for arg in (device_arg, dtype, op) if arg is not None) result = test_fn(self, *args) - except RuntimeError as rte: - if 'CUDA error: device-side assert triggered' in rte.__repr__(): - self._stop_test_suite = True - # raise the runtime error as is. - raise rte finally: self.precision = guard_precision @@ -321,12 +313,6 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): for dtype in dtypes: instantiate_test_helper(cls, name, test=test, dtype=dtype, op=None) - def run(self, result=None): - super().run(result=result) - # Early terminate test if _stop_test_suite is set. - if self._stop_test_suite: - result.stop() - class CPUTestBase(DeviceTypeTestBase): device_type = 'cpu' From 89b4899ea5363fd69872c0cabf0dedea2dc533c8 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Dec 2020 22:34:54 -0800 Subject: [PATCH 42/45] [quant][graphmode][fx] Standalone module support {input/output}_quantized_idxs (#49754) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49754 This PR adds the support for {input/output}_quantized_idxs for standalone module. if input_quantized_idxs = [] and output_quantized_idxs = [], the standalone module will be expecting float input and produce float output, and will quantize the input and dequantize output internally if input_quantized_idxs = [0] and otuput_qiuantized_idxs = [0], the standalone module will be expecting quantized input and produce quantized output, the input will be quantized in the parent module, and output will be dequantized in the parent module as well, this is similar to current quantized modules like nn.quantized.Conv2d For more details, please see the test case Test Plan: python test/test_quantization.py TestQuantizeFx.test_standalone_module Imported from OSS Reviewed By: raghuramank100 Differential Revision: D25684692 fbshipit-source-id: 900360e01c0e35b26fe85f4a887dc1fd6f7bfb66 --- test/quantization/test_quantize_fx.py | 126 +++++++++++++---- torch/quantization/fx/observed_module.py | 10 +- .../quantization/fx/quantization_patterns.py | 4 +- torch/quantization/fx/quantize.py | 132 +++++++++++++----- torch/quantization/quantize_fx.py | 23 ++- 5 files changed, 221 insertions(+), 74 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 66324f928f04..0aba50779432 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -570,7 +570,16 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def test_standalone_module(self): + def _test_standalone_module( + self, + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check): + """ Test standalone module with different quantized input/quantized output + configurations + """ class StandaloneModule(torch.nn.Module): def __init__(self): super().__init__() @@ -610,45 +619,32 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": 
[("standalone", None, None)]} - config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} - for prepare_config in [config_name, config_class]: + for is_name in [True, False]: + if is_name: + prepare_config = { + "standalone_module_name": [("standalone", None, interface_config)] + } + else: + prepare_config = { + "standalone_module_class": [(StandaloneModule, None, interface_config)] + } + original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) + + qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for input and output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) # check converted/quantized model m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) res = m(data) # quantize the reference model @@ -658,6 +654,76 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + def test_standalone_module_float_interface(self): + float_interface_config = { + "input_quantized_idxs": [], # float input + "output_quantized_idxs": [], # float output + } + interface_config = float_interface_config + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for input and output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + convert_count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method("dequantize") : 1, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + + def test_standalone_module_quantized_interface(self): + quantized_interface_config = { + "input_quantized_idxs": 
[0], # quantized input + "output_quantized_idxs": [0], # quantized output + } + interface_config = quantized_interface_config + # observer for input and output of first conv + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + convert_count_check = { + # quantizing input for conv + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + # dequantizing output of standalone module + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + ns.call_module(nnq.Conv2d): 1, + # dequantization for output happens in parent module + ns.call_method("dequantize") : 0, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py index a95bc184fa10..808a3b36fb4a 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any +from typing import Union, Dict, Any, List class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self): + def get_preserved_attr_names(self) -> List[str]: return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,6 +35,12 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): + def get_preserved_attr_names(self) -> List[str] : + return super().get_preserved_attr_names() + [ + "_standalone_module_input_quantized_idxs", + "_standalone_module_output_quantized_idxs" + ] + def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index a1e601332d4a..ed2f7e35659c 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -753,10 +753,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] + input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - # standalone module takes float input - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index af9496a66a63..d821f9610b7f 100644 --- 
a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,14 +102,15 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def insert_observer_for_special_module( +def maybe_insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node): + prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None + standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( @@ -129,19 +130,22 @@ def insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = name_config_map.get(node.target, (None, None)) - standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - standalone_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, config) + sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + sm_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) + prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) + standalone_module_input_idxs = observed_standalone_module.\ + _standalone_module_input_quantized_idxs observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore + return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -155,7 +159,8 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]]): + matched_nodes: Optional[List[Node]], + standalone_module_input_idxs: Optional[List[int]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -215,8 +220,11 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return + assert node.op == "call_module" + output_is_quantized = 0 in \ + modules[node.target]._standalone_module_output_quantized_idxs # type: ignore + if output_is_quantized: + observed_node_names_set.add(node.name) elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs @@ -226,6 +234,16 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) + # insert observer for input of standalone module + if standalone_module_input_idxs is not None: + for idx in standalone_module_input_idxs: + if 
node.args[idx].name not in observed_node_names_set: # type: ignore + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -373,10 +391,19 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - When we are preparing a standalone module: - both input and output are observed in prepared standalone module + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module Returns: model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -430,8 +457,6 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) - # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -487,14 +512,15 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + standalone_module_input_idxs = \ + maybe_insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes) + matched_nodes, standalone_module_input_idxs) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -516,6 +542,19 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) + if is_standalone_module: + assert result_node is not None + assert isinstance(result_node.args[0], Node), \ + "standalone module only supports returning simple value currently"\ + "(not tuple, dict etc.)" + # indicator for whether output is observed or not. + # This used for correctly quantize standalone modules + output_is_observed = \ + result_node.args[0].name in observed_node_names_set + # these inputs are observed in parent + model._standalone_module_input_quantized_idxs = \ + input_quantized_idxs + model._standalone_module_output_quantized_idxs = output_quantized_idxs return model def save_state(self, observed: GraphModule) -> None: @@ -569,8 +608,10 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module which accepts float input - and produces float output. 
+ Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -627,36 +668,50 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized - - if quantized is a boolean, then all args will be - quantized/not quantized - if quantized is None, then we'll load the node as long as it exists + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) + - if quantized is a list or tuple, then arg should be a list and + the args with corresponding indexes will be quantized Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. """ assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) + if isinstance(quantized, (tuple, list)) and len(quantized) == 0: + # empty tuple or list means nothing is quantized + quantized = False def load_arg_impl(arg_or_args): - if quantized is None: + # we'll update the format of `quantized` + # to better match arg_or_args + updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized + + if isinstance(quantized, (tuple, list)) and \ + len(quantized) == 1 and isinstance(arg_or_args, Node): + # when argument is one Node instead of tuple, we just need to check + # 0 is in the quantized list + updated_quantized = 0 in quantized + + if updated_quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(quantized, bool): + if isinstance(updated_quantized, bool): return map_arg( arg_or_args, - load_quantized if quantized else load_non_quantized) - elif isinstance(quantized, (tuple, list)): + load_quantized if updated_quantized else load_non_quantized) + elif isinstance(updated_quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in quantized: + if i in updated_quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -690,10 +745,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output is expected to be quantized + # by default the output for a quantizable node is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state for the output + # Need to get correct quantized/non-quantized state forn the output # of CopyNode if type(obj) in [ CopyNode, @@ -750,7 +805,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == 'output': + if node.op == "output": cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in 
output_quantized_idxs: @@ -775,12 +830,19 @@ def insert_quantize_node(node: Node) -> None: quantized = False else: assert obj is not None + # We will get whether the output is quantized or not before + # convert for standalone module and after convert + # for non-standalone module, since _standalone_module_output_quantized_idxs + # is only available in observed standalone module + if is_observed_standalone_module_node: + out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs + assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" + quantized = 0 in out_quant_idxs + result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if is_observed_standalone_module_node: - quantized = False - else: + if not is_observed_standalone_module_node: quantized = is_output_quantized(node, obj) if quantized: diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index cba104b8f783..89ba877ffe78 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -107,8 +107,20 @@ def _prepare_standalone_module_fx( standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Both input and output of the module are observed in the - standalone module. + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module + + Returns: + model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -378,8 +390,9 @@ def _convert_standalone_module_fx( r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - Return: - A quantized standalone module which accepts float input - and produces float output. + Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From ec6de6a697668e594a3f1d49e9a87a7c94b6164b Mon Sep 17 00:00:00 2001 From: Summer Deng Date: Thu, 24 Dec 2020 03:47:26 -0800 Subject: [PATCH 43/45] Clip small scales to fp16 min Summary: When the FC output min max range is very small, we want to enforce a cutoff on the scale parameter to better generalize for future values that could fall beyond the original range. 
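(Editorial aside, not part of the patch: the cutoff added below keeps the quantization scale out of the fp16 subnormal range. A rough Python sketch of that check, mirroring the SMALL_SCALE_THRESHOLD constant and the scale formula in the C++ change; the helper name is made up here, and the real code also falls back to ChooseQuantizationParams when the histogram bin width is zero.)

# Mirrors: scale = (max - min) / ((1 << precision) - 1), with a cutoff below the fp16-safe range.
SMALL_SCALE_THRESHOLD = 6.1e-5  # roughly the smallest normal fp16 magnitude (2**-14 ~= 6.1e-5)

def needs_small_scale_fallback(x_min: float, x_max: float, precision: int = 8) -> bool:
    scale = (x_max - x_min) / float((1 << precision) - 1)
    return scale < SMALL_SCALE_THRESHOLD

# A very narrow FC output range trips the cutoff; a typical range does not.
assert needs_small_scale_fallback(-0.001, 0.001)   # scale ~= 7.8e-6
assert not needs_small_scale_fallback(-1.0, 1.0)   # scale ~= 7.8e-3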
Test Plan: More analysis about the output distributions can be found in N425166 An example workflow using fp16 min clipping is f240972205 Reviewed By: jspark1105 Differential Revision: D25681249 fbshipit-source-id: c4dfbd3ee823886afed06e6c2eccfc29d612f7e6 --- caffe2/quantization/server/norm_minimization.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/caffe2/quantization/server/norm_minimization.cc b/caffe2/quantization/server/norm_minimization.cc index 94e655e56da2..a8d0d3da0dbe 100644 --- a/caffe2/quantization/server/norm_minimization.cc +++ b/caffe2/quantization/server/norm_minimization.cc @@ -14,6 +14,10 @@ namespace dnnlowp { #undef NDEBUG +// Use fp16_min as the small scale cutoff because we don't want to use scales in fp16 subnormal range. +// This is to be consistent with Glow and FakeLowP implementation for NNPI. +constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + static float GetNorm(float begin, float end, float density, NormMinimization::Kind kind) { float norm = 0; @@ -57,7 +61,8 @@ TensorQuantizationParams NormMinimization::NonlinearQuantizationParamsSearch( vector bins_f(dnnlowp::adjust_hist_to_include_zero(hist, &min, &max)); int nbins = bins_f.size(); float bin_width = (max - min) / nbins; - if (bin_width == 0) { + float scale = (max - min) / float((1 << precision) - 1); + if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) { QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance(); return qfactory->ChooseQuantizationParams( min, max, precision, preserve_sparsity); @@ -190,6 +195,12 @@ TensorQuantizationParams NormMinimization::ChooseQuantizationParams( int nbins = bins_f.size(); float bin_width = (max - min) / nbins; + float scale = (max - min) / float((1 << precision) - 1); + if (bin_width == 0 || scale < SMALL_SCALE_THRESHOLD) { + QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance(); + return qfactory->ChooseQuantizationParams( + min, max, precision, preserve_sparsity); + } int dst_nbins = 1 << precision; int zero_bin = round(-min / bin_width); From 46cf6d332f075ed90d3baf21c32de51e4f304549 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Dec 2020 15:49:01 -0800 Subject: [PATCH 44/45] Revert D25684692: [quant][graphmode][fx] Standalone module support {input/output}_quantized_idxs Test Plan: revert-hammer Differential Revision: D25684692 (https://github.com/pytorch/pytorch/commit/89b4899ea5363fd69872c0cabf0dedea2dc533c8) Original commit changeset: 900360e01c0e fbshipit-source-id: 8b65fa8fbc7b364fbddb5f23cc696cd9b7db98cd --- test/quantization/test_quantize_fx.py | 126 ++++------------- torch/quantization/fx/observed_module.py | 10 +- .../quantization/fx/quantization_patterns.py | 4 +- torch/quantization/fx/quantize.py | 132 +++++------------- torch/quantization/quantize_fx.py | 23 +-- 5 files changed, 74 insertions(+), 221 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 0aba50779432..66324f928f04 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -570,16 +570,7 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def _test_standalone_module( - self, - interface_config, - prepare_count_check, - standalone_prepare_count_check, - convert_count_check, - standalone_convert_count_check): - """ Test standalone module with different quantized input/quantized output - configurations - """ + def test_standalone_module(self): class StandaloneModule(torch.nn.Module): def __init__(self): 
super().__init__() @@ -619,32 +610,45 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - for is_name in [True, False]: - if is_name: - prepare_config = { - "standalone_module_name": [("standalone", None, interface_config)] - } - else: - prepare_config = { - "standalone_module_class": [(StandaloneModule, None, interface_config)] - } - + qconfig_dict = {"": default_qconfig} + config_name = {"standalone_module_name": [("standalone", None, None)]} + config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} + for prepare_config in [config_name, config_class]: original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) - - qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + # for input and output of conv in the standalone module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) # check converted/quantized model m = convert_fx(m) - self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) res = m(data) # quantize the reference model @@ -654,76 +658,6 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) - def test_standalone_module_float_interface(self): - float_interface_config = { - "input_quantized_idxs": [], # float input - "output_quantized_idxs": [], # float output - } - interface_config = float_interface_config - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - prepare_count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - # for input and output of conv in the standalone module - standalone_prepare_count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - convert_count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method("dequantize") : 1, - } - standalone_convert_count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 
1, - ns.call_method("dequantize") : 1, - } - self._test_standalone_module( - interface_config, - prepare_count_check, - standalone_prepare_count_check, - convert_count_check, - standalone_convert_count_check) - - def test_standalone_module_quantized_interface(self): - quantized_interface_config = { - "input_quantized_idxs": [0], # quantized input - "output_quantized_idxs": [0], # quantized output - } - interface_config = quantized_interface_config - # observer for input and output of first conv - prepare_count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - # for output of conv in the standalone module - standalone_prepare_count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 1 - } - convert_count_check = { - # quantizing input for conv - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - # dequantizing output of standalone module - ns.call_method("dequantize") : 1, - } - standalone_convert_count_check = { - # quantization of input happens in parent module - # quantization of output happens in the quantized conv module - ns.call_function(torch.quantize_per_tensor) : 0, - ns.call_module(nnq.Conv2d): 1, - # dequantization for output happens in parent module - ns.call_method("dequantize") : 0, - } - self._test_standalone_module( - interface_config, - prepare_count_check, - standalone_prepare_count_check, - convert_count_check, - standalone_convert_count_check) - @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py index 808a3b36fb4a..a95bc184fa10 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any, List +from typing import Union, Dict, Any class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self) -> List[str]: + def get_preserved_attr_names(self): return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,12 +35,6 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): - def get_preserved_attr_names(self) -> List[str] : - return super().get_preserved_attr_names() + [ - "_standalone_module_input_quantized_idxs", - "_standalone_module_output_quantized_idxs" - ] - def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index ed2f7e35659c..a1e601332d4a 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -753,10 +753,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] - input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - return 
quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) + # standalone module takes float input + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index d821f9610b7f..af9496a66a63 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,15 +102,14 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def maybe_insert_observer_for_special_module( +def insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: + prepare_custom_config_dict: Any, qconfig: Any, node: Node): """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None - standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( @@ -130,22 +129,19 @@ def maybe_insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = name_config_map.get(node.target, config) - sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - sm_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, (None, None)) + standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + standalone_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) - standalone_module_input_idxs = observed_standalone_module.\ - _standalone_module_input_quantized_idxs + prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore - return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -159,8 +155,7 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]], - standalone_module_input_idxs: Optional[List[int]]): + matched_nodes: Optional[List[Node]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -220,11 +215,8 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - assert node.op == "call_module" - output_is_quantized = 0 in \ - modules[node.target]._standalone_module_output_quantized_idxs # type: ignore - if output_is_quantized: - observed_node_names_set.add(node.name) + # output is observed in the standalone module + return elif (quantize_handler.all_node_args and 
input_output_observed(quantize_handler)): # observer for outputs @@ -234,16 +226,6 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) - # insert observer for input of standalone module - if standalone_module_input_idxs is not None: - for idx in standalone_module_input_idxs: - if node.args[idx].name not in observed_node_names_set: # type: ignore - new_observer = qconfig.activation() - insert_observer( - node, new_observer, model, - activation_post_process_map, env, observed_graph, - load_arg, observed_node_names_set) - def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -391,19 +373,10 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - How the standalone module is observed is specified by `input_quantized_idxs` and - `output_quantized_idxs` in the prepare_custom_config for the standalone module + When we are preparing a standalone module: + both input and output are observed in prepared standalone module Returns: model(GraphModule): prepared standalone module - attributes: - _standalone_module_input_quantized_idxs(List[Int]): a list of - indexes for the graph input that is expected to be quantized, - same as input_quantized_idxs configuration provided - for the standalone module - _standalone_module_output_quantized_idxs(List[Int]): a list of - indexs for the graph output that is quantized - same as input_quantized_idxs configuration provided - for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -457,6 +430,8 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) + # indexes for the inputs that needs to be observed + standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -512,15 +487,14 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - standalone_module_input_idxs = \ - maybe_insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes, standalone_module_input_idxs) + matched_nodes) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -542,19 +516,6 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) - if is_standalone_module: - assert result_node is not None - assert isinstance(result_node.args[0], Node), \ - "standalone module only supports returning simple value currently"\ - "(not tuple, dict etc.)" - # indicator for whether output is observed or not. 
- # This used for correctly quantize standalone modules - output_is_observed = \ - result_node.args[0].name in observed_node_names_set - # these inputs are observed in parent - model._standalone_module_input_quantized_idxs = \ - input_quantized_idxs - model._standalone_module_output_quantized_idxs = output_quantized_idxs return model def save_state(self, observed: GraphModule) -> None: @@ -608,10 +569,8 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module, whether input/output is quantized is - specified by prepare_custom_config_dict, with - input_quantized_idxs, output_quantized_idxs, please - see docs for prepare_fx for details + Returns a quantized standalone module which accepts float input + and produces float output. """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -668,50 +627,36 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] + def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is None, then we'll load the node as long as it - exists - - if quantized is a boolean, then all args will be - quantized/not quantized - - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) - if quantized is a list or tuple, then arg should be a list and the args with corresponding indexes will be quantized + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is None, then we'll load the node as long as it + exists Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. 
""" assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) - if isinstance(quantized, (tuple, list)) and len(quantized) == 0: - # empty tuple or list means nothing is quantized - quantized = False def load_arg_impl(arg_or_args): - # we'll update the format of `quantized` - # to better match arg_or_args - updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized - - if isinstance(quantized, (tuple, list)) and \ - len(quantized) == 1 and isinstance(arg_or_args, Node): - # when argument is one Node instead of tuple, we just need to check - # 0 is in the quantized list - updated_quantized = 0 in quantized - - if updated_quantized is None: + if quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(updated_quantized, bool): + if isinstance(quantized, bool): return map_arg( arg_or_args, - load_quantized if updated_quantized else load_non_quantized) - elif isinstance(updated_quantized, (tuple, list)): + load_quantized if quantized else load_non_quantized) + elif isinstance(quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in updated_quantized: + if i in quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -745,10 +690,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output for a quantizable node is expected to be quantized + # by default the output is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state forn the output + # Need to get correct quantized/non-quantized state for the output # of CopyNode if type(obj) in [ CopyNode, @@ -805,7 +750,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == "output": + if node.op == 'output': cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in output_quantized_idxs: @@ -830,19 +775,12 @@ def insert_quantize_node(node: Node) -> None: quantized = False else: assert obj is not None - # We will get whether the output is quantized or not before - # convert for standalone module and after convert - # for non-standalone module, since _standalone_module_output_quantized_idxs - # is only available in observed standalone module - if is_observed_standalone_module_node: - out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs - assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" - quantized = 0 in out_quant_idxs - result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if not is_observed_standalone_module_node: + if is_observed_standalone_module_node: + quantized = False + else: quantized = is_output_quantized(node, obj) if quantized: diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 89ba877ffe78..cba104b8f783 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -107,20 +107,8 @@ def _prepare_standalone_module_fx( standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
- How the standalone module is observed is specified by `input_quantized_idxs` and - `output_quantized_idxs` in the prepare_custom_config for the standalone module - - Returns: - model(GraphModule): prepared standalone module - attributes: - _standalone_module_input_quantized_idxs(List[Int]): a list of - indexes for the graph input that is expected to be quantized, - same as input_quantized_idxs configuration provided - for the standalone module - _standalone_module_output_quantized_idxs(List[Int]): a list of - indexs for the graph output that is quantized - same as input_quantized_idxs configuration provided - for the standalone module + Both input and output of the module are observed in the + standalone module. """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -390,9 +378,8 @@ def _convert_standalone_module_fx( r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - Returns a quantized standalone module, whether input/output is quantized is - specified by prepare_custom_config_dict, with - input_quantized_idxs, output_quantized_idxs, please - see docs for prepare_fx for details + Return: + A quantized standalone module which accepts float input + and produces float output. """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From 963f7629b591dc9750476faf1513bc7f1fb4d6de Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 24 Dec 2020 22:40:46 -0800 Subject: [PATCH 45/45] [numpy] `torch.digamma` : promote integer inputs to float (#48302) Summary: **BC-breaking Note:** This PR updates PyTorch's digamma function to be consistent with SciPy's special.digamma function. This changes the result of the digamma function on the nonpositive integers, where the gamma function is not defined. Since the gamma function is undefined at these points, the (typical) derivative of the logarithm of the gamma function is also undefined at these points, and for negative integers this PR updates digamma to return NaN. For zero, however, it returns -inf to be consistent with SciPy. Interestingly, SciPy made a similar change, which was noticed by at least one user: https://github.com/scipy/scipy/issues/9663#issue-396587679. 
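As a quick illustration of the new behavior (this assumes a PyTorch build that already contains this patch; SciPy is used only as the reference implementation):

```python
import torch
from scipy import special  # reference values

x = torch.tensor([0.0, -1.0, -2.0, 1.0])
print(torch.digamma(x))            # tensor([-inf, nan, nan, -0.5772])
print(special.digamma(x.numpy()))  # SciPy agrees: -inf at 0, NaN at negative integers
```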
SciPy's returning of negative infinity at zero is intentional: https://github.com/scipy/scipy/blob/59347ae8b86bcc92c339efe213128f64ab6df98c/scipy/special/cephes/psi.c#L163 This change is consistent with the C++ standard for the gamma function: https://en.cppreference.com/w/cpp/numeric/math/tgamma **PR Summary:** Reference https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48302 Reviewed By: ngimel Differential Revision: D25664087 Pulled By: mruberry fbshipit-source-id: 1168e81e218bf9fe5b849db0e07e7b22e590cf73 --- aten/src/ATen/native/Math.h | 22 ++++++-- aten/src/ATen/native/UnaryOps.cpp | 4 +- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- aten/src/ATen/native/cuda/Math.cuh | 11 +++- .../src/ATen/native/cuda/UnaryGammaKernels.cu | 2 +- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 56 ++++++++++--------- torch/_torch_docs.py | 8 ++- .../_internal/common_methods_invocations.py | 13 +++++ 9 files changed, 76 insertions(+), 43 deletions(-) diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 04c6925933a3..6cd0464de921 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -277,15 +277,20 @@ static inline float trigamma(float x) { * See note [3-Clause BSD License for the Cephes Math Library]. */ static inline double calc_digamma(double x) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma static double PSI_10 = 2.25175258906672110764; if (x == 0) { - return INFINITY; + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(INFINITY, -x); } - int x_is_integer = x == floor(x); + bool x_is_integer = x == trunc(x); if (x < 0) { if (x_is_integer) { - return INFINITY; + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return NAN; } return calc_digamma(1 - x) - M_PI / tan(M_PI * x); } @@ -324,15 +329,20 @@ static inline double calc_digamma(double x) { * See note [3-Clause BSD License for the Cephes Math Library]. */ static inline float calc_digamma(float x) { + // See [C++ Standard Reference: Gamma Function] static float PSI_10 = 2.25175258906672110764f; if (x == 0) { - return INFINITY; + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(INFINITY, -x); } - int x_is_integer = x == floorf(x); + bool x_is_integer = x == truncf(x); if (x < 0) { if (x_is_integer) { - return INFINITY; + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return NAN; } // Avoid rounding errors for `tan`'s input. // Those make a big difference at extreme values. 
diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 4eb1f393e47c..e6dd1bc4afde 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -318,8 +318,8 @@ Tensor& round_out(Tensor& result, const Tensor& self) { return unary_op_impl_out Tensor round(const Tensor& self) { return unary_op_impl(self, at::round_out); } Tensor& round_(Tensor& self) { return unary_op_impl_(self, at::round_out); } -Tensor& digamma_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, digamma_stub); } -Tensor digamma(const Tensor& self) { return unary_op_impl(self, digamma_out); } +Tensor& digamma_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, digamma_stub); } +Tensor digamma(const Tensor& self) { return unary_op_impl_float(self, digamma_stub); } Tensor& digamma_(Tensor& self) { return unary_op_impl_(self, digamma_out); } Tensor& reciprocal_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, reciprocal_stub); } diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 42a761439ac0..049b3eff6b5b 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -360,7 +360,7 @@ static void atanh_kernel(TensorIterator& iter) { } static void digamma_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "digamma", [&]() { + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "digamma", [&]() { cpu_kernel( iter, [=](scalar_t a) -> scalar_t { return calc_digamma(a); }); diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index 1daba76c9446..17c30cd00ea7 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -93,6 +93,7 @@ static inline __host__ __device__ scalar_t zeta(scalar_t _x, scalar_t _q) { */ template static inline __host__ __device__ scalar_t calc_digamma(scalar_t in) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma using accscalar_t = at::acc_type; static const double PI_f64 = 3.14159265358979323846; const accscalar_t PSI_10 = 2.25175258906672110764; @@ -108,14 +109,18 @@ static inline __host__ __device__ scalar_t calc_digamma(scalar_t in) { accscalar_t x = static_cast(in); if (x == 0) { - return static_cast(INFINITY); + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(static_cast(INFINITY), -x); } - bool x_is_integer = x == ::floor(x); + bool x_is_integer = x == ::trunc(x); accscalar_t result = 0; if (x < 0) { if (x_is_integer) { - return static_cast(INFINITY); + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return static_cast(NAN); } // Rounding errors in tan's input can really affect the output // for extreme values, so we always perform this computation in double. 
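The dispatch changes above (`unary_op_impl_float` on the CPU path and `common_dtype()` in the kernels) are what enable integer inputs; a small usage sketch, assuming the default dtype is float32:

```python
import torch

t = torch.arange(1, 4)   # int64 input, previously rejected by the floating-point-only dispatch
out = torch.digamma(t)   # with this patch, promoted to the default floating dtype
print(out.dtype)         # torch.float32
```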
diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu index d752d606474d..97dbeefccc77 100644 --- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu @@ -11,7 +11,7 @@ namespace at { namespace native { void digamma_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "digamma_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "digamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_digamma(a); }); diff --git a/test/test_torch.py b/test/test_torch.py index 04fadcb65c66..6532c2e5e17d 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6925,7 +6925,6 @@ def inner(self, device, dtype): ('trunc', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('ceil', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('lgamma', '', _small_3d, lambda t, d: [], 1e-2, 1e-1, 1e-5, _float_types_no_half, [torch.bfloat16]), - ('digamma', 'op', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e0, _float_types_no_half), ] # Creates and decorates a generic test and adds it to the class. diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 37ef90514803..776482306f4d 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -497,6 +497,35 @@ def test_sqrt_complex_edge_values(self, device, dtype): x = torch.tensor(-1.0000e+20 - 4988429.2000j, dtype=dtype, device=device) self.compare_with_numpy(torch.sqrt, np.sqrt, x) + @unittest.skipIf(not TEST_SCIPY, "Requires SciPy") + @dtypes(torch.float, torch.double) + def test_digamma_special(self, device, dtype): + # Based on SciPy test for the following special values. 
+ # Reference: + # https://github.com/scipy/scipy/blob/3a8a3a1d4657254a6611e77e9c28feafa26e6645/scipy/special/tests/test_digamma.py#L22 + euler = 0.57721566490153286 + dataset = [(0., -0.), + (1, -euler), + (0.5, -2 * math.log(2) - euler), + (1 / 3, -math.pi / (2 * math.sqrt(3)) - 3 * math.log(3) / 2 - euler), + (1 / 4, -math.pi / 2 - 3 * math.log(2) - euler), + (1 / 6, -math.pi * math.sqrt(3) / 2 - 2 * math.log(2) - 3 * math.log(3) / 2 - euler), + (1 / 8, -math.pi / 2 - 4 * math.log(2) - + (math.pi + math.log(2 + math.sqrt(2)) - math.log(2 - math.sqrt(2))) / math.sqrt(2) - euler)] + x = torch.tensor(dataset, device=device, dtype=dtype) + self.compare_with_numpy(torch.digamma, scipy.special.digamma, x) + + @unittest.skipIf(not TEST_SCIPY, "Requires SciPy") + @dtypes(torch.float, torch.double) + def test_digamma(self, device, dtype): + # Tests pole behavior + # TODO: Add value `-1931.99999994`, to the tensor below when + # https://github.com/pytorch/pytorch/issues/49015 is fixed + tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, + -100.99999994, 0.000000111, + -0.000000111, 0, -0, -1, -2, -931], dtype=dtype, device=device) + self.compare_with_numpy(torch.digamma, scipy.special.digamma, tensor) + # TODO opinfo mvlgamma @unittest.skipIf(not TEST_SCIPY, "Scipy not found") def test_mvlgamma(self, device): @@ -1120,30 +1149,6 @@ def test_polygamma(self, device, dtype): torch.autograd.gradcheck(lambda x: x.polygamma(n), cpu_tensor) - # Note: fails when using float tensors - # TODO: update this test to just compare against NumPy - @onlyCUDA - @dtypes(torch.double) - def test_digamma(self, device, dtype): - cpu_tensor = torch.randn(10, 10, 10, dtype=dtype) - device_tensor = cpu_tensor.to(device) - zeros = torch.zeros(10, 10, 10, dtype=dtype) - cpu_out = cpu_tensor.digamma() - device_out = device_tensor.digamma() - norm_errors = (device_out - cpu_out.to(device)) / device_out - self.assertEqual(norm_errors, zeros) - - # Tests pole behavior - cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, - -100.99999994, -1931.99999994, 0.000000111, - -0.000000111, 0, -1, -2, -931], dtype=dtype) - expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan], dtype=dtype) - device_tensor = cpu_tensor.to(device) - cpu_out = cpu_tensor.digamma() - device_out = device_tensor.digamma() - norm_errors = (device_out - cpu_out.to(device)) / device_out - self.assertEqual(norm_errors, expected_errors) - # TODO: update to compare against NumPy by rationalizing with OpInfo @onlyCUDA @dtypes(torch.float, torch.double) @@ -1725,9 +1730,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('polygamma', args=[2], substr='_2', reffn='polygamma', refargs=lambda x: (2, x.numpy()), input_fn=_generate_gamma_input, inputargs=[False], ref_backend='scipy', rtol=0.0008, atol=1e-5), - _TorchMathTestMeta('digamma', - input_fn=_generate_gamma_input, inputargs=[True], ref_backend='scipy', - replace_inf_with_nan=True), _TorchMathTestMeta('abs', input_fn=_medium_2d, dtypes=_types_no_half, rtol=0., atol=0.), _TorchMathTestMeta('logit', ref_backend='scipy')] diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d46a6b1bcf84..e05784cbcc22 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2532,8 +2532,7 @@ def merge_dicts(*dicts): [ 1.0500, 0.7336, -0.3836, -1.1015]]]) """.format(**common_args)) -add_docstr(torch.digamma, - r""" +add_docstr(torch.digamma, r""" digamma(input, *, out=None) -> Tensor Computes the logarithmic derivative of the gamma function on `input`. 
@@ -2547,6 +2546,11 @@ def merge_dicts(*dicts): Keyword args: {out} +.. note:: This function is similar to SciPy's `scipy.special.digamma`. + +.. note:: From PyTorch 1.8 onwards, the digamma function returns `-Inf` for `0`. + Previously it returned `NaN` for `0`. + Example:: >>> a = torch.tensor([1, 0.5]) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 808506dc6809..87d0baa895e8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1087,6 +1087,19 @@ def reference_sigmoid(x): promotes_integers_to_float=True, assert_autodiffed=True, test_complex_grad=False), # Reference: https://github.com/pytorch/pytorch/issues/48552 + UnaryUfuncInfo('digamma', + ref=scipy.special.digamma, + decorators=(precisionOverride({torch.float16: 5e-1}),), + dtypes=all_types_and(torch.bool), + dtypesIfCPU=all_types_and(torch.bool), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + skips=( + # In some cases, output is NaN (for input close to + # negative integers) especially due to reduced precision + # in float16 and NaN's can't be tested for equality. + SkipInfo('TestCommon', 'test_variant_consistency_jit', + device_type='cuda', dtypes=[torch.float16]),), + promotes_integers_to_float=True), UnaryUfuncInfo('erf', ref=scipy.special.erf, decorators=(precisionOverride({torch.float16: 1e-2,