From 06dfaadfc6357ed909ed15c7ef79d503c49d9475 Mon Sep 17 00:00:00 2001 From: Charles David Hernandez Date: Thu, 8 Jul 2021 17:16:42 -0700 Subject: [PATCH 001/122] update internal function names that apply to both cpu and cuda (#59701) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59701 These functions have been updated to work for cpu and cuda, their names are now changed to reflect that quantize_per_channel_cpu -> quantize_per_channel dequantize_quantized_cpu -> dequantize_quantized (Note: this ignores all push blocking failures!) Test Plan: python test/test_quantization.py TestQuantizedTensor Imported from OSS Reviewed By: raghuramank100 Differential Revision: D29018270 fbshipit-source-id: 3a0da8d5e3f357dcf19119bcdbc6172d41f2b0c1 --- .../ATen/benchmarks/quantize_per_channel.cpp | 6 +- aten/src/ATen/native/native_functions.yaml | 4 +- aten/src/ATen/native/quantized/QTensor.cpp | 4 +- .../native/quantized/affine_quantizer.cpp | 8 +- aten/src/ATen/native/quantized/cpu/qconv.cpp | 2 +- .../src/ATen/native/quantized/cpu/qlinear.cpp | 2 +- .../native/quantized/cuda/affine_quantizer.cu | 235 ++++++++++-------- aten/src/ATen/test/quantized_test.cpp | 4 +- .../core/test_quantized_tensor.py | 2 +- 9 files changed, 153 insertions(+), 114 deletions(-) diff --git a/aten/src/ATen/benchmarks/quantize_per_channel.cpp b/aten/src/ATen/benchmarks/quantize_per_channel.cpp index 375fad4e6b135..3b72428c1558c 100644 --- a/aten/src/ATen/benchmarks/quantize_per_channel.cpp +++ b/aten/src/ATen/benchmarks/quantize_per_channel.cpp @@ -16,7 +16,7 @@ static void quantize_per_channel_4d_contiguous(benchmark::State& state) { at::Tensor qa; for (auto _ : state) { - qa = at::native::quantize_per_channel_cpu( + qa = at::native::quantize_per_channel( a, scales, zero_points, 1, at::ScalarType::QUInt8); } } @@ -36,7 +36,7 @@ static void quantize_per_channel_4d_channels_last(benchmark::State& state) { at::Tensor qa; for (auto _ : state) { - qa = at::native::quantize_per_channel_cpu( + qa = at::native::quantize_per_channel( a, scales, zero_points, 1, at::ScalarType::QUInt8); } } @@ -52,7 +52,7 @@ static void quantize_per_channel_2d(benchmark::State& state) { at::Tensor qa; for (auto _ : state) { - qa = at::native::quantize_per_channel_cpu( + qa = at::native::quantize_per_channel( a, scales, zero_points, 0, at::ScalarType::QUInt8); } } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ae903f6f28e4c..e8c5dc9e8f00c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -5159,13 +5159,13 @@ - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor variants: function dispatch: - CPU, CUDA: quantize_per_channel_cpu + CPU, CUDA: quantize_per_channel - func: dequantize.self(Tensor self) -> Tensor variants: function, method dispatch: CPU: dequantize_cpu - QuantizedCPU, QuantizedCUDA: dequantize_quantized_cpu + QuantizedCPU, QuantizedCUDA: dequantize_quantized - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] variants: function diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index ce71acf5a5040..8220fc1faba0c 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -45,7 +45,7 @@ std::vector quantize_per_tensor_list_cpu( return quantized_tensors; } -Tensor quantize_per_channel_cpu( +Tensor quantize_per_channel( const Tensor& self, const 
Tensor& scales, const Tensor& zero_points, @@ -59,7 +59,7 @@ Tensor dequantize_cpu(const Tensor& self) { return self.to(at::kFloat); } -Tensor dequantize_quantized_cpu(const Tensor& self) { +Tensor dequantize_quantized(const Tensor& self) { return get_qtensorimpl(self)->quantizer()->dequantize(self); } diff --git a/aten/src/ATen/native/quantized/affine_quantizer.cpp b/aten/src/ATen/native/quantized/affine_quantizer.cpp index 47b9ae6ac5d72..9b01c3a5294b9 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer.cpp @@ -81,13 +81,13 @@ void checkZeroPoint(const std::string& fn_name, int64_t zero_point) { fn_name, " zero_point ", zero_point, - " is out of range."); + " is above upper bound."); TORCH_CHECK( zero_point >= std::numeric_limits::min(), fn_name, " zero_point ", zero_point, - " is out of range."); + " is below lower bound."); } template @@ -157,7 +157,7 @@ Tensor& quantize_tensor_per_channel_affine( checkQuantizedTensor(fn_name, qtensor); if(qtensor.device().type() != c10::DeviceType::CUDA){ checkZeroPoints(fn_name, zero_points); - } + } // for cuda, this check will occur in the actual cuda function }); TORCH_CHECK( @@ -261,7 +261,7 @@ Tensor& dequantize_tensor_per_channel_affine( checkQuantizedTensor(fn_name, qtensor); if(qtensor.device().type() != c10::DeviceType::CUDA){ checkZeroPoints(fn_name, zero_points); - } + } // for cuda, this check will occur in the actual cuda function }); TORCH_CHECK( diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index bc4b0434e366a..bf5c596a9e0d2 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -648,7 +648,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( at::Tensor bias_quant_scales = weight_contig.q_per_channel_scales() * act_input_scale; at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); - qbias = at::native::quantize_per_channel_cpu( + qbias = at::native::quantize_per_channel( bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32); } else { qbias = at::native::quantize_per_tensor( diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 4e49035abcff7..9f3bb4b9d2773 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -312,7 +312,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl( at::Tensor bias_quant_scales = weight_contig.q_per_channel_scales() * input_scale; at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); - qbias = at::native::quantize_per_channel_cpu( + qbias = at::native::quantize_per_channel( bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32); } else { qbias = at::native::quantize_per_tensor( diff --git a/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu b/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu index babfd9d5352eb..f8e41e5112573 100644 --- a/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu +++ b/aten/src/ATen/native/quantized/cuda/affine_quantizer.cu @@ -1,15 +1,33 @@ +#include +#include #include #include #include #include #include -#include -#include namespace at { namespace native { namespace { +template +void check_zero_points_cuda( + const std::string& fn_name, + const Tensor& zero_points) { + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + auto zp_within_upper = at::any(at::gt(zero_points, 
qmax)).item().equal(false); + auto zp_within_lower = at::any(at::lt(zero_points, qmin)).item().equal(false); + TORCH_CHECK( + zp_within_lower, + fn_name, + "zero_point is below lower bound."); + TORCH_CHECK( + zp_within_upper, + fn_name, + "zero_point is above upper bound."); +} + void quantize_tensor_per_tensor_affine_cuda( const Tensor& rtensor, Tensor& qtensor, @@ -21,12 +39,11 @@ void quantize_tensor_per_tensor_affine_cuda( constexpr int64_t qmax = std::numeric_limits::max(); auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(qtensor) - .add_input(rtensor) - .add_input(qtensor) - .build(); - + .check_all_same_dtype(false) + .add_output(qtensor) + .add_input(rtensor) + .add_input(qtensor) + .build(); gpu_kernel( iter, [=] GPU_LAMBDA(float raw_val, scalar_t quantized_val) -> scalar_t { @@ -48,10 +65,10 @@ void dequantize_tensor_per_tensor_affine_cuda( AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_cuda", [&]() { auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(rtensor) - .add_input(qtensor) - .build(); + .check_all_same_dtype(false) + .add_output(rtensor) + .add_input(qtensor) + .build(); gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t value) -> float { return (static_cast(value.val_) - zero_point) * scale; }); @@ -59,12 +76,12 @@ void dequantize_tensor_per_tensor_affine_cuda( } void quantize_tensor_per_channel_affine_cuda( - const Tensor& rtensor, - Tensor& qtensor, - const Tensor& scales, - const Tensor& zero_points, - int64_t axis) { - + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis) { + static const std::string fn_name = "quantize_tensor_per_channel_affine_cuda"; std::vector expected_shape(rtensor.dim(), 1); expected_shape[axis] = rtensor.size(axis); @@ -72,18 +89,18 @@ void quantize_tensor_per_channel_affine_cuda( auto shaped_zero_points = native::_unsafe_view(zero_points, expected_shape); auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(qtensor) - .add_input(rtensor) - .add_input(qtensor) - .add_input(shaped_scales) - .add_input(shaped_zero_points) - .build(); - - + .check_all_same_dtype(false) + .add_output(qtensor) + .add_input(rtensor) + .add_input(qtensor) + .add_input(shaped_scales) + .add_input(shaped_zero_points) + .build(); AT_DISPATCH_QINT_TYPES( - qtensor.scalar_type(), "quantize_tensor_per_channel_affine_cuda_handler", [&]() { + qtensor.scalar_type(), fn_name, [&]() { + check_zero_points_cuda(fn_name, zero_points); + constexpr int64_t qmin = std::numeric_limits::min(); constexpr int64_t qmax = std::numeric_limits::max(); // trying to match _quantize_per_channel_ref_nd in test_quantized_tensor.py @@ -102,12 +119,12 @@ void quantize_tensor_per_channel_affine_cuda( } void dequantize_tensor_per_channel_affine_cuda( - const Tensor& qtensor, - Tensor& rtensor, - const Tensor& scales, - const Tensor& zero_points, - int64_t axis) { - + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis) { + static const std::string fn_name = "dequantize_tensor_per_channel_affine_cuda"; std::vector expected_shape(rtensor.dim(), 1); expected_shape[axis] = rtensor.size(axis); @@ -115,28 +132,35 @@ void dequantize_tensor_per_channel_affine_cuda( auto shaped_zero_points = native::_unsafe_view(zero_points, expected_shape); AT_DISPATCH_QINT_TYPES( - qtensor.scalar_type(), "dequantize_tensor_per_channel_affine_cuda_handler", [&]() { - auto iter = 
TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(rtensor) - .add_input(qtensor) - .add_input(shaped_scales) - .add_input(shaped_zero_points) - .build(); - - gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t value, double scale, int64_t zero_point) -> float { - return static_cast(value.val_ - zero_point) * scale; + qtensor.scalar_type(), + fn_name, + [&]() { + check_zero_points_cuda(fn_name, zero_points); + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(rtensor) + .add_input(qtensor) + .add_input(shaped_scales) + .add_input(shaped_zero_points) + .build(); + + gpu_kernel( + iter, + [=] GPU_LAMBDA( + scalar_t value, double scale, int64_t zero_point) -> float { + return static_cast(value.val_ - zero_point) * scale; + }); }); - }); } void quantize_tensor_per_channel_float_qparams_cuda( - const Tensor& rtensor, - Tensor& qtensor, - const Tensor& scales, - const Tensor& zero_points, - int64_t axis) { - + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis) { + static const std::string fn_name = "quantize_tensor_per_channel_float_qparams_cuda"; std::vector expected_shape(rtensor.dim(), 1); expected_shape[axis] = rtensor.size(axis); @@ -144,39 +168,47 @@ void quantize_tensor_per_channel_float_qparams_cuda( auto shaped_zero_points = native::_unsafe_view(zero_points, expected_shape); auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(qtensor) - .add_input(rtensor) - .add_input(qtensor) - .add_input(shaped_scales) - .add_input(shaped_zero_points) - .build(); + .check_all_same_dtype(false) + .add_output(qtensor) + .add_input(rtensor) + .add_input(qtensor) + .add_input(shaped_scales) + .add_input(shaped_zero_points) + .build(); AT_DISPATCH_QINT_TYPES( - qtensor.scalar_type(), "quantize_tensor_per_channel_float_qparams_cuda_handler", [&]() { - constexpr int64_t qmin = std::numeric_limits::min(); - constexpr int64_t qmax = std::numeric_limits::max(); - // trying to match _quantize_per_channel_ref_nd in test_quantized_tensor.py - gpu_kernel( - iter, - [=] GPU_LAMBDA(float raw_val, scalar_t quantized_val, float scale, float zero_point) -> scalar_t { - float inv_scale = 1.0f/scale; - int64_t qvalue = lrintf(raw_val*inv_scale + zero_point); - qvalue = std::max(qvalue, qmin); - qvalue = std::min(qvalue, qmax); - quantized_val.val_ = qvalue; - return quantized_val; - }); - }); + qtensor.scalar_type(), + fn_name, + [&]() { + check_zero_points_cuda(fn_name, zero_points); + + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + // trying to match _quantize_per_channel_ref_nd in + gpu_kernel( + iter, + [=] GPU_LAMBDA( + float raw_val, + scalar_t quantized_val, + float scale, + float zero_point) -> scalar_t { + float inv_scale = 1.0f / scale; + int64_t qvalue = lrintf(raw_val * inv_scale + zero_point); + qvalue = std::max(qvalue, qmin); + qvalue = std::min(qvalue, qmax); + quantized_val.val_ = qvalue; + return quantized_val; + }); + }); } void dequantize_tensor_per_channel_float_qparams_cuda( - const Tensor& qtensor, - Tensor& rtensor, - const Tensor& scales, - const Tensor& zero_points, - int64_t axis) { - + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis) { + static const std::string fn_name = "dequantize_tensor_per_channel_float_qparams_cuda"; std::vector expected_shape(rtensor.dim(), 1); expected_shape[axis] = rtensor.size(axis); @@ -184,19 +216,26 @@ void 
dequantize_tensor_per_channel_float_qparams_cuda( auto shaped_zero_points = native::_unsafe_view(zero_points, expected_shape); AT_DISPATCH_QINT_TYPES( - qtensor.scalar_type(), "dequantize_tensor_per_channel_float_qparams_cuda_handler", [&]() { - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(rtensor) - .add_input(qtensor) - .add_input(shaped_scales) - .add_input(shaped_zero_points) - .build(); - - gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t value, float scale, float zero_point) -> float { - return (static_cast(value.val_) - zero_point) * scale; + qtensor.scalar_type(), + fn_name, + [&]() { + check_zero_points_cuda(fn_name, zero_points); + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(rtensor) + .add_input(qtensor) + .add_input(shaped_scales) + .add_input(shaped_zero_points) + .build(); + + gpu_kernel( + iter, + [=] GPU_LAMBDA( + scalar_t value, float scale, float zero_point) -> float { + return (static_cast(value.val_) - zero_point) * scale; + }); }); - }); } } // anonymous namespace @@ -211,13 +250,13 @@ REGISTER_DISPATCH( quantize_tensor_per_channel_affine_stub, &quantize_tensor_per_channel_affine_cuda); REGISTER_DISPATCH( - dequantize_tensor_per_channel_affine_stub, - &dequantize_tensor_per_channel_affine_cuda); + dequantize_tensor_per_channel_affine_stub, + &dequantize_tensor_per_channel_affine_cuda); REGISTER_DISPATCH( - quantize_tensor_per_channel_float_qparams_stub, - &quantize_tensor_per_channel_float_qparams_cuda); + quantize_tensor_per_channel_float_qparams_stub, + &quantize_tensor_per_channel_float_qparams_cuda); REGISTER_DISPATCH( - dequantize_tensor_per_channel_float_qparams_stub, - &dequantize_tensor_per_channel_float_qparams_cuda); + dequantize_tensor_per_channel_float_qparams_stub, + &dequantize_tensor_per_channel_float_qparams_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp index a92720a282827..ee59f87f406a7 100644 --- a/aten/src/ATen/test/quantized_test.cpp +++ b/aten/src/ATen/test/quantized_test.cpp @@ -165,7 +165,7 @@ TEST(TestQTensor, QuantizePerChannel4d) { } } // quantize and check values - Tensor q = at::native::quantize_per_channel_cpu( + Tensor q = at::native::quantize_per_channel( tensor, scales, zero_points, ch_axis, kQUInt8); auto* q_data = (uint8_t*)q.data_ptr(); for (int c = 0, i = 0; c < C; ++c) { @@ -199,7 +199,7 @@ TEST(TestQTensor, QuantizePerChannel4dChannelsLast) { } // quantize and check values - Tensor q = at::native::quantize_per_channel_cpu( + Tensor q = at::native::quantize_per_channel( tensor, scales, zero_points, ch_axis, kQUInt8); auto* q_data = (uint8_t*)q.data_ptr(); for (int e = 0, i = 0; e < H * W; ++e) { diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index d90dbb63930a0..e365fe7f73a18 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -399,7 +399,7 @@ def test_compare_per_tensor_device_numerics(self): qtr_cuda = torch.quantize_per_tensor(r.to(device), scale, zero_point, dtype) dqtr_cuda = qtr_cuda.dequantize() self.assertEqual(qtr.int_repr(), qtr_cuda.int_repr()) - self.assertTrue(np.allclose(dqtr.numpy(), dqtr_cuda.cpu().numpy())) + self.assertTrue(np.allclose(dqtr, dqtr_cuda.cpu())) @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') def test_compare_per_channel_device_numerics(self): From 
a74516d6999d08c13f5e7acc90fb54d94ea38130 Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 8 Jul 2021 17:28:19 -0700 Subject: [PATCH 002/122] [static runtime] implement aten::log (#61393) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61393 Test Plan: Added `StaticRuntime.IndividualOps_Log` ``` ... [ RUN ] StaticRuntime.IndividualOps_Log V0701 12:10:50.829100 3708165 impl.cpp:455] StaticModuleOptions: cleanup_activations 1, enable_out_variant 1, optimize_memory1, optimize_graph_output_memory0 V0701 12:10:50.888468 3708165 impl.cpp:1279] Switch to out variant for node: %3 : Tensor = aten::log(%inp.1) V0701 12:10:50.889098 3708165 impl.cpp:1279] Switch to out variant for node: %a.1 : Tensor = aten::clone(%3, %2) ``` Reviewed By: hlu1 Differential Revision: D29511622 fbshipit-source-id: 819fd7d90c084609a060efeadb3015e35acac517 --- benchmarks/static_runtime/test_scripts.h | 6 ++++++ .../static_runtime/test_static_runtime.cc | 10 ++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 17 +++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 683cb9e4908c1..f24060ae0b078 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -341,6 +341,12 @@ const auto div_scalar_mode = R"JIT( return torch.div(a, b, rounding_mode=c).clone() )JIT"; +const auto log_tensor = R"JIT( + def forward(self, inp: Tensor): + a = torch.log(inp).clone() + return (a) +)JIT"; + const auto sub_tensor = R"JIT( def forward(self, a: Tensor, b: Tensor): return torch.sub(a, b).clone() diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 2747172c44619..01935427ec3dd 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -426,6 +426,16 @@ TEST(StaticRuntime, IndividualOps_Div) { testStaticRuntime(div_scalar_mode, args3, {a, 1.5, "trunc"}); } +TEST(StaticRuntime, IndividualOps_Log) { + // Ensure that the input values are valid. 
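+  // log() is only defined for positive values, so the abs() calls below keep the random inputs in its domain.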
+ auto a = at::abs(at::randn({2, 3})); + auto b = at::abs(at::randn({4, 3, 2})); + + std::vector args{a}; + testStaticRuntime(log_tensor, args); + testStaticRuntime(log_tensor, args, {b}); +} + TEST(StaticRuntime, IndividualOps_Sub) { auto a = at::randn({2, 3}); auto b = at::randn({2, 3}); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 906220303e546..aa045fd6cbf34 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1478,6 +1478,23 @@ REGISTER_OPERATOR_FUNCTOR(aten::div, aten_div, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR(aten::log, aten_log, [](Node* n) -> SROperator { + if (!n->matches(torch::schema("aten::log.Tensor(Tensor input) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& in0_t = p_node->Input(0).toTensor(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = create_empty_from(in0_t); + } + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + + at::cpu::log_out(out_t, in0_t); + }; +}); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) REGISTER_OPERATOR_FUNCTOR(aten::sub, aten_sub, [](Node* n) -> SROperator { if (!n->matches(torch::schema( From 5fbc853c5fbf63c5b0c787c6759e743ca482dd01 Mon Sep 17 00:00:00 2001 From: Lily Johnson Date: Thu, 8 Jul 2021 17:35:04 -0700 Subject: [PATCH 003/122] [package] PackageExporter remove verbose mode (#61145) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61145 Remove 'verbose' mode from PackageExporter as people have complained that it is not useful. Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D29559681 Pulled By: Lilyjjo fbshipit-source-id: eadb1a3a25fadc64119334a09bf1fa4b355b1edd --- docs/source/package.rst | 6 ++-- test/package/test_dependency_api.py | 30 +++++++++---------- test/package/test_dependency_hooks.py | 10 +++---- test/package/test_directory_reader.py | 16 +++++----- test/package/test_importer.py | 8 ++--- test/package/test_mangling.py | 2 +- test/package/test_misc.py | 14 ++++----- test/package/test_model.py | 17 ++++------- test/package/test_package_fx.py | 12 ++++---- test/package/test_package_script.py | 8 ++--- test/package/test_resources.py | 8 ++--- test/package/test_save_load.py | 20 ++++++------- test/package/test_torchscript.py | 42 +++++++++++++-------------- torch/package/package_exporter.py | 35 ---------------------- 14 files changed, 94 insertions(+), 134 deletions(-) diff --git a/docs/source/package.rst b/docs/source/package.rst index ea303037cab52..c9bdd54906d17 100644 --- a/docs/source/package.rst +++ b/docs/source/package.rst @@ -100,7 +100,7 @@ use the glob-style ``include`` and ``exclude`` filtering arguments. 
:: - with PackageExporter('my_package.pt', verbose=False) as pe: + with PackageExporter('my_package.pt') as pe: pe.save_pickle('models', 'model_1.pkl', mod) # can limit printed items with include/exclude args print(pe.file_structure(include=["**/utils.py", "**/*.pkl"], exclude="**/*.storages")) @@ -250,7 +250,7 @@ Steps: foo_1 = foo.Foo("foo_1 initial string") foo_2 = foo.Foo("foo_2 initial string") - with PackageExporter('foo_package.pt', verbose=False) as pe: + with PackageExporter('foo_package.pt') as pe: # save as normal, no extra work necessary pe.save_pickle('foo_collection', 'foo1.pkl', foo_1) pe.save_pickle('foo_collection', 'foo2.pkl', foo_2) @@ -452,7 +452,7 @@ Saving TorchScript objects that are attributes or submodules is supported as wel :: # save TorchScript just like any other object - with PackageExporter(file_name, verbose=True) as e: + with PackageExporter(file_name) as e: e.save_pickle("res", "script_model.pkl", scripted_model) e.save_pickle("res", "mixed_model.pkl", python_model_with_scripted_submodule) # load as normal diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index bd002c2c599a3..6ec178c72cd32 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -24,7 +24,7 @@ class TestDependencyAPI(PackageTestCase): def test_extern(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.extern(["package_a.subpackage", "module_a"]) he.save_source_string("foo", "import package_a.subpackage; import module_a") buffer.seek(0) @@ -42,7 +42,7 @@ def test_extern(self): def test_extern_glob(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.extern(["package_a.*", "module_*"]) he.save_module("package_a") he.save_source_string( @@ -76,7 +76,7 @@ def test_extern_glob_allow_empty(self): buffer = BytesIO() with self.assertRaisesRegex(EmptyMatchError, r"did not match any modules"): - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.extern(include=["package_b.*"], allow_empty=False) exporter.save_module("package_a.subpackage") @@ -87,7 +87,7 @@ def test_deny(self): buffer = BytesIO() with self.assertRaisesRegex(PackagingError, "denied"): - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.deny(["package_a.subpackage", "module_a"]) exporter.save_source_string("foo", "import package_a.subpackage") @@ -97,7 +97,7 @@ def test_deny_glob(self): """ buffer = BytesIO() with self.assertRaises(PackagingError): - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.deny(["package_a.*", "module_*"]) exporter.save_source_string( "test_module", @@ -112,7 +112,7 @@ def test_deny_glob(self): @skipIf(version_info < (3, 7), "mock uses __getattr__ a 3.7 feature") def test_mock(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.mock(["package_a.subpackage", "module_a"]) # Import something that dependso n package_a.subpackage he.save_source_string("foo", "import package_a.subpackage") @@ -133,7 +133,7 @@ def test_mock(self): @skipIf(version_info < (3, 7), "mock uses __getattr__ a 3.7 feature") def test_mock_glob(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.mock(["package_a.*", "module*"]) 
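            # mocked modules are packaged as stubs; code that actually uses them fails at runtime rather than at packaging time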
he.save_module("package_a") he.save_source_string( @@ -168,7 +168,7 @@ def test_mock_glob_allow_empty(self): buffer = BytesIO() with self.assertRaisesRegex(EmptyMatchError, r"did not match any modules"): - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.mock(include=["package_b.*"], allow_empty=False) exporter.save_module("package_a.subpackage") @@ -180,7 +180,7 @@ def test_pickle_mocked(self): obj2 = package_a.PackageAObject(obj) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.mock(include="package_a.subpackage") he.intern("**") he.save_pickle("obj", "obj.pkl", obj2) @@ -195,7 +195,7 @@ def test_allow_empty_with_error(self): """If an error occurs during packaging, it should not be shadowed by the allow_empty error.""" buffer = BytesIO() with self.assertRaises(ModuleNotFoundError): - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: # Even though we did not extern a module that matches this # pattern, we want to show the save_module error, not the allow_empty error. @@ -212,7 +212,7 @@ def test_implicit_intern(self): import package_a # noqa: F401 buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.save_module("package_a") def test_intern_error(self): @@ -225,7 +225,7 @@ def test_intern_error(self): buffer = BytesIO() with self.assertRaises(PackagingError) as e: - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.save_pickle("obj", "obj.pkl", obj2) self.assertEqual( @@ -240,7 +240,7 @@ def test_intern_error(self): ) # Interning all dependencies should work - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.intern(["package_a", "package_a.subpackage"]) he.save_pickle("obj", "obj.pkl", obj2) @@ -272,7 +272,7 @@ def import_module(self, module_name): with self.assertRaises(PackagingError) as e: with PackageExporter( - buffer, verbose=False, importer=BrokenImporter() + buffer, importer=BrokenImporter() ) as exporter: exporter.intern(["foo", "bar"]) exporter.save_source_string("my_module", "import foo; import bar") @@ -292,7 +292,7 @@ def test_invalid_import(self): """An incorrectly-formed import should raise a PackagingError.""" buffer = BytesIO() with self.assertRaises(PackagingError) as e: - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: # This import will fail to load. exporter.save_source_string("foo", "from ........ 
import lol") diff --git a/test/package/test_dependency_hooks.py b/test/package/test_dependency_hooks.py index 59215c3b85f67..0541d6a77e2e8 100644 --- a/test/package/test_dependency_hooks.py +++ b/test/package/test_dependency_hooks.py @@ -26,7 +26,7 @@ def test_single_hook(self): def my_extern_hook(package_exporter, module_name): my_externs.add(module_name) - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.extern(["package_a.subpackage", "module_a"]) exporter.register_extern_hook(my_extern_hook) exporter.save_source_string("foo", "import module_a") @@ -45,7 +45,7 @@ def my_extern_hook(package_exporter, module_name): def my_extern_hook2(package_exporter, module_name): my_externs.remove(module_name) - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.extern(["package_a.subpackage", "module_a"]) exporter.register_extern_hook(my_extern_hook) exporter.register_extern_hook(my_extern_hook2) @@ -65,7 +65,7 @@ def my_mock_hook(package_exporter, module_name): def my_mock_hook2(package_exporter, module_name): my_mocks.remove(module_name) - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.mock(["package_a.subpackage", "module_a"]) exporter.register_mock_hook(my_mock_hook) exporter.register_mock_hook(my_mock_hook2) @@ -85,7 +85,7 @@ def my_extern_hook(package_exporter, module_name): def my_extern_hook2(package_exporter, module_name): my_externs2.add(module_name) - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.extern(["package_a.subpackage", "module_a"]) handle = exporter.register_extern_hook(my_extern_hook) exporter.register_extern_hook(my_extern_hook2) @@ -107,7 +107,7 @@ def my_extern_hook(package_exporter, module_name): def my_mock_hook(package_exporter, module_name): my_mocks.add(module_name) - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.extern("module_a") exporter.mock("package_a") exporter.register_extern_hook(my_extern_hook) diff --git a/test/package/test_directory_reader.py b/test/package/test_directory_reader.py index a9e8cf07c1d97..93968d6e1bf92 100644 --- a/test/package/test_directory_reader.py +++ b/test/package/test_directory_reader.py @@ -50,7 +50,7 @@ def test_loading_pickle(self): resnet = resnet18() filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.intern("**") e.save_pickle("model", "model.pkl", resnet) @@ -70,7 +70,7 @@ def test_loading_module(self): import package_a filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.save_module("package_a") zip_file = zipfile.ZipFile(filename, "r") @@ -88,7 +88,7 @@ def test_loading_has_record(self): import package_a # noqa: F401 filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.save_module("package_a") zip_file = zipfile.ZipFile(filename, "r") @@ -103,7 +103,7 @@ def test_loading_has_record(self): def test_resource_reader(self): """Tests DirectoryReader as the base for get_resource_reader.""" filename = self.temp() - with PackageExporter(filename, verbose=False) as pe: + with PackageExporter(filename) as pe: # Layout looks like: # package # ├── one/ @@ -185,7 +185,7 @@ def secret_message(): """ ) filename = self.temp() - with 
PackageExporter(filename, verbose=False) as pe: + with PackageExporter(filename) as pe: pe.save_source_string("foo.bar", mod_src) pe.save_text("my_cool_resources", "sekrit.txt", "my sekrit plays") @@ -202,7 +202,7 @@ def secret_message(): @skipIf(version_info < (3, 7), "ResourceReader API introduced in Python 3.7") def test_importer_access(self): filename = self.temp() - with PackageExporter(filename, verbose=False) as he: + with PackageExporter(filename) as he: he.save_text("main", "main", "my string") he.save_binary("main", "main_binary", "my string".encode("utf-8")) src = dedent( @@ -231,7 +231,7 @@ def test_resource_access_by_path(self): Tests that packaged code can used importlib.resources.path. """ filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.save_binary("string_module", "my_string", "my string".encode("utf-8")) src = dedent( """\ @@ -263,7 +263,7 @@ def test_scriptobject_failure_message(self): scripted_mod = torch.jit.script(ModWithTensor(torch.rand(1, 2, 3))) filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.save_pickle("res", "mod.pkl", scripted_mod) zip_file = zipfile.ZipFile(filename, "r") diff --git a/test/package/test_importer.py b/test/package/test_importer.py index 432392d431498..683739265166b 100644 --- a/test/package/test_importer.py +++ b/test/package/test_importer.py @@ -45,7 +45,7 @@ def test_single_ordered_importer(self): import package_a buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.save_module(package_a.__name__) buffer.seek(0) @@ -71,7 +71,7 @@ def test_ordered_importer_basic(self): import package_a buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.save_module(package_a.__name__) buffer.seek(0) @@ -135,7 +135,7 @@ def test_package_importer_whichmodule_no_dunder_module(self): # Set up a PackageImporter which has a torch.float16 object pickled: buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as exporter: + with PackageExporter(buffer) as exporter: exporter.save_pickle("foo", "foo.pkl", my_dtype) buffer.seek(0) @@ -144,7 +144,7 @@ def test_package_importer_whichmodule_no_dunder_module(self): # Re-save a package with only our PackageImporter as the importer buffer2 = BytesIO() - with PackageExporter(buffer2, verbose=False, importer=importer) as exporter: + with PackageExporter(buffer2, importer=importer) as exporter: exporter.save_pickle("foo", "foo.pkl", my_loaded_dtype) buffer2.seek(0) diff --git a/test/package/test_mangling.py b/test/package/test_mangling.py index ae8a7eb1562ea..7db192865348b 100644 --- a/test/package/test_mangling.py +++ b/test/package/test_mangling.py @@ -81,7 +81,7 @@ def test_unique_module_names(self): obj = package_a.subpackage.PackageASubpackageObject() obj2 = package_a.PackageAObject(obj) f1 = BytesIO() - with PackageExporter(f1, verbose=False) as pe: + with PackageExporter(f1) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj2) f1.seek(0) diff --git a/test/package/test_misc.py b/test/package/test_misc.py index 76ff2e0bd964a..ae8004f091d96 100644 --- a/test/package/test_misc.py +++ b/test/package/test_misc.py @@ -63,7 +63,7 @@ def test_file_structure(self): """ ) - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: import module_a import package_a import package_a.subpackage @@ -101,7 +101,7 @@ def 
test_file_structure_has_file(self): Test Directory's has_file() method. """ buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: import package_a.subpackage he.intern("**") @@ -122,7 +122,7 @@ def test_is_from_package(self): buffer = BytesIO() obj = package_a.subpackage.PackageASubpackageObject() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj) @@ -144,7 +144,7 @@ def test_inspect_class(self): buffer = BytesIO() obj = package_a.subpackage.PackageASubpackageObject() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj) @@ -168,7 +168,7 @@ def test_dunder_package_present(self): buffer = BytesIO() obj = package_a.subpackage.PackageASubpackageObject() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj) @@ -187,7 +187,7 @@ def test_dunder_package_works_from_package(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_module(mod.__name__) @@ -208,7 +208,7 @@ def test_std_lib_sys_hackery_checks(self): buffer = BytesIO() mod = package_a.std_sys_module_hacks.Module() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", mod) diff --git a/test/package/test_model.py b/test/package/test_model.py index 8292b39947f52..f5e08b6bfa83c 100644 --- a/test/package/test_model.py +++ b/test/package/test_model.py @@ -1,4 +1,4 @@ -from io import BytesIO, StringIO +from io import BytesIO from textwrap import dedent from unittest import skipIf @@ -35,18 +35,13 @@ def test_resnet(self): f1 = self.temp() # create a package that will save it along with its code - with PackageExporter(f1, verbose=False) as e: + with PackageExporter(f1) as e: # put the pickled resnet in the package, by default # this will also save all the code files references by # the objects in the pickle e.intern("**") e.save_pickle("model", "model.pkl", resnet) - # check th debug graph has something reasonable: - buf = StringIO() - debug_graph = e._write_dep_graph(failing_module="torch") - self.assertIn("torchvision.models.resnet", debug_graph) - # we can now load the saved model i = PackageImporter(f1) r2 = i.load_pickle("model", "model.pkl") @@ -66,7 +61,7 @@ def test_resnet(self): # came from imported packages so that it can resolve # class names like torchvision.models.resnet.ResNet # to their source code. - with PackageExporter(f2, verbose=False, importer=(i, sys_importer)) as e: + with PackageExporter(f2, importer=(i, sys_importer)) as e: # e.importers is a list of module importing functions # that by default contains importlib.import_module. 
# it is searched in order until the first success and @@ -110,7 +105,7 @@ def test_model_save(self): # Option 1: save by pickling the whole model # + single-line, similar to torch.jit.save # - more difficult to edit the code after the model is created - with PackageExporter(f1, verbose=False) as e: + with PackageExporter(f1) as e: e.intern("**") e.save_pickle("model", "pickled", resnet) # note that this source is the same for all models in this approach @@ -133,7 +128,7 @@ def load(): # Option 2: save with state dict # - more code to write to save/load the model # + but this code can be edited later to adjust adapt the model later - with PackageExporter(f2, verbose=False) as e: + with PackageExporter(f2) as e: e.intern("**") e.save_pickle("model", "state_dict", resnet.state_dict()) src = dedent( @@ -174,7 +169,7 @@ def test_script_resnet(self): # Option 1: save by pickling the whole model # + single-line, similar to torch.jit.save # - more difficult to edit the code after the model is created - with PackageExporter(f1, verbose=False) as e: + with PackageExporter(f1) as e: e.intern("**") e.save_pickle("model", "pickled", resnet) diff --git a/test/package/test_package_fx.py b/test/package/test_package_fx.py index 6aede766279fb..7ad05a105e2d6 100644 --- a/test/package/test_package_fx.py +++ b/test/package/test_package_fx.py @@ -29,7 +29,7 @@ def forward(self, x): traced = symbolic_trace(st) f = BytesIO() - with PackageExporter(f, verbose=False) as pe: + with PackageExporter(f) as pe: pe.save_pickle("model", "model.pkl", traced) f.seek(0) @@ -43,7 +43,7 @@ def test_package_then_fx(self): model = SimpleTest() f = BytesIO() - with PackageExporter(f, verbose=False) as pe: + with PackageExporter(f) as pe: pe.intern("**") pe.save_pickle("model", "model.pkl", model) @@ -59,7 +59,7 @@ def test_package_fx_package(self): model = SimpleTest() f = BytesIO() - with PackageExporter(f, verbose=False) as pe: + with PackageExporter(f) as pe: pe.intern("**") pe.save_pickle("model", "model.pkl", model) @@ -73,12 +73,12 @@ def test_package_fx_package(self): # This should fail, because we are referencing some globals that are # only in the package. with self.assertRaises(ObjMismatchError): - with PackageExporter(f2, verbose=False) as pe: + with PackageExporter(f2) as pe: pe.intern("**") pe.save_pickle("model", "model.pkl", traced) f2.seek(0) - with PackageExporter(f2, importer=(pi, sys_importer), verbose=False) as pe: + with PackageExporter(f2, importer=(pi, sys_importer)) as pe: # Make the package available to the exporter's environment. 
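            # intern("**") pulls the source for every remaining dependency into the new package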
pe.intern("**") pe.save_pickle("model", "model.pkl", traced) @@ -102,7 +102,7 @@ def test_package_fx_with_imports(self): gm = GraphModule(torch.nn.Module(), graph) f = BytesIO() - with PackageExporter(f, verbose=False) as pe: + with PackageExporter(f) as pe: pe.intern("**") pe.save_pickle("model", "model.pkl", gm) f.seek(0) diff --git a/test/package/test_package_script.py b/test/package/test_package_script.py index 2e81cfb82c7dd..a44ba62a2c306 100644 --- a/test/package/test_package_script.py +++ b/test/package/test_package_script.py @@ -28,7 +28,7 @@ def test_package_interface(self): scripted.proxy_mod = torch.jit.script(fake.NewModule()) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.intern("**") pe.save_pickle("model", "model.pkl", uses_interface) buffer.seek(0) @@ -54,7 +54,7 @@ def test_different_package_interface(self): # Simulate a package that contains a different version of the # interface, with the exact same name. buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.save_source_string( fake.__name__, dedent( @@ -94,7 +94,7 @@ def test_package_script_class(self): import package_a.fake_script_class as fake buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.save_module(fake.__name__) buffer.seek(0) @@ -118,7 +118,7 @@ def test_different_package_script_class(self): # Simulate a package that contains a different version of the # script class ,with the attribute `bar` instead of `foo` buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe2: + with PackageExporter(buffer) as pe2: pe2.save_source_string( fake.__name__, dedent( diff --git a/test/package/test_resources.py b/test/package/test_resources.py index 204d5ed50227f..1dcfa4cd3e3c7 100644 --- a/test/package/test_resources.py +++ b/test/package/test_resources.py @@ -21,7 +21,7 @@ class TestResources(PackageTestCase): def test_resource_reader(self): """Test compliance with the get_resource_reader importlib API.""" buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: # Layout looks like: # package # ├── one/ @@ -89,7 +89,7 @@ def secret_message(): """ ) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as pe: + with PackageExporter(buffer) as pe: pe.save_source_string("foo.bar", mod_src) pe.save_text("my_cool_resources", "sekrit.txt", "my sekrit plays") @@ -101,7 +101,7 @@ def secret_message(): def test_importer_access(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.save_text("main", "main", "my string") he.save_binary("main", "main_binary", "my string".encode("utf-8")) src = dedent( @@ -125,7 +125,7 @@ def test_resource_access_by_path(self): Tests that packaged code can used importlib.resources.path. 
""" buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: he.save_binary("string_module", "my_string", "my string".encode("utf-8")) src = dedent( """\ diff --git a/test/package/test_save_load.py b/test/package/test_save_load.py index e50b42fa5e48d..1798c02270d4b 100644 --- a/test/package/test_save_load.py +++ b/test/package/test_save_load.py @@ -26,7 +26,7 @@ class TestSaveLoad(PackageTestCase): ) def test_saving_source(self): filename = self.temp() - with PackageExporter(filename, verbose=False) as he: + with PackageExporter(filename) as he: he.save_source_file("foo", str(packaging_directory / "module_a.py")) he.save_source_file("foodir", str(packaging_directory / "package_a")) hi = PackageImporter(filename) @@ -41,7 +41,7 @@ def test_saving_source(self): ) def test_saving_string(self): filename = self.temp() - with PackageExporter(filename, verbose=False) as he: + with PackageExporter(filename) as he: src = dedent( """\ import math @@ -63,7 +63,7 @@ def test_saving_string(self): ) def test_save_module(self): filename = self.temp() - with PackageExporter(filename, verbose=False) as he: + with PackageExporter(filename) as he: import module_a import package_a @@ -79,7 +79,7 @@ def test_save_module(self): def test_dunder_imports(self): buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as he: + with PackageExporter(buffer) as he: import package_b obj = package_b.PackageBObject @@ -110,7 +110,7 @@ def test_dunder_imports(self): def test_save_module_binary(self): f = BytesIO() - with PackageExporter(f, verbose=False) as he: + with PackageExporter(f) as he: import module_a import package_a @@ -136,7 +136,7 @@ def test_pickle(self): obj2 = package_a.PackageAObject(obj) filename = self.temp() - with PackageExporter(filename, verbose=False) as he: + with PackageExporter(filename) as he: he.intern("**") he.save_pickle("obj", "obj.pkl", obj2) hi = PackageImporter(filename) @@ -167,7 +167,7 @@ def test_save_imported_module_fails(self): obj = package_a.subpackage.PackageASubpackageObject() obj2 = package_a.PackageAObject(obj) f1 = self.temp() - with PackageExporter(f1, verbose=False) as pe: + with PackageExporter(f1) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj) @@ -175,7 +175,7 @@ def test_save_imported_module_fails(self): loaded1 = importer1.load_pickle("obj", "obj.pkl") f2 = self.temp() - pe = PackageExporter(f2, verbose=False, importer=(importer1, sys_importer)) + pe = PackageExporter(f2, importer=(importer1, sys_importer)) with self.assertRaisesRegex(ModuleNotFoundError, "torch.package"): pe.save_module(loaded1.__module__) @@ -194,7 +194,7 @@ def test_exporting_mismatched_code(self): obj = package_a.subpackage.PackageASubpackageObject() obj2 = package_a.PackageAObject(obj) f1 = self.temp() - with PackageExporter(f1, verbose=False) as pe: + with PackageExporter(f1) as pe: pe.intern("**") pe.save_pickle("obj", "obj.pkl", obj2) @@ -206,7 +206,7 @@ def test_exporting_mismatched_code(self): f2 = self.temp() def make_exporter(): - pe = PackageExporter(f2, verbose=False, importer=[importer1, sys_importer]) + pe = PackageExporter(f2, importer=[importer1, sys_importer]) # Ensure that the importer finds the 'PackageAObject' defined in 'importer1' first. 
return pe diff --git a/test/package/test_torchscript.py b/test/package/test_torchscript.py index 8f6f4d9250441..d240282bfb2f2 100644 --- a/test/package/test_torchscript.py +++ b/test/package/test_torchscript.py @@ -41,7 +41,7 @@ def test_save_scriptmodule(self): scripted_mod = torch.jit.script(ModWithTensor(torch.rand(1, 2, 3))) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "mod.pkl", scripted_mod) buffer.seek(0) @@ -63,7 +63,7 @@ def test_save_scriptmodule_file(self): scripted_mod = torch.jit.script(ModWithTensor(torch.rand(1, 2, 3))) filename = self.temp() - with PackageExporter(filename, verbose=False) as e: + with PackageExporter(filename) as e: e.save_pickle("res", "mod.pkl", scripted_mod) importer = PackageImporter(filename) @@ -82,7 +82,7 @@ def test_save_scriptmodule_with_submods(self): ) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "mod.pkl", scripted_mod) buffer.seek(0) @@ -130,7 +130,7 @@ def forward(self, input: str): scripted_mod_1 = torch.jit.script(TopMod()) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_0) e.save_pickle("res", "mod2.pkl", scripted_mod_1) @@ -153,7 +153,7 @@ def test_save_independent_scriptmodules(self): scripted_mod_1 = torch.jit.script(ModWithTensor(torch.rand(1, 2, 3))) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_0) e.save_pickle("res", "mod2.pkl", scripted_mod_1) @@ -187,7 +187,7 @@ def test_save_repeat_scriptmodules(self): ) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "mod0.pkl", scripted_mod_0) e.save_pickle("res", "mod1.pkl", scripted_mod_1) e.save_pickle("res", "mod2.pkl", scripted_mod_0) @@ -219,7 +219,7 @@ def test_scriptmodules_repeat_save(self): ) buffer_0 = BytesIO() - with PackageExporter(buffer_0, verbose=False) as e: + with PackageExporter(buffer_0) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_0) buffer_0.seek(0) @@ -227,7 +227,7 @@ def test_scriptmodules_repeat_save(self): loaded_module_0 = importer_0.load_pickle("res", "mod1.pkl") buffer_1 = BytesIO() - with PackageExporter(buffer_1, verbose=False) as e: + with PackageExporter(buffer_1) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_1) e.save_pickle("res", "mod2.pkl", loaded_module_0) @@ -263,14 +263,14 @@ def forward(self, input): scripted_mod_1 = torch.jit.script(ModWithTensor(torch.rand(1, 2, 3))) buffer_0 = BytesIO() - with PackageExporter(buffer_0, verbose=False) as e: + with PackageExporter(buffer_0) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_0) buffer_0.seek(0) importer_0 = importer = PackageImporter(buffer_0) buffer_1 = BytesIO() - with PackageExporter(buffer_1, verbose=False) as e: + with PackageExporter(buffer_1) as e: e.save_pickle("res", "mod1.pkl", scripted_mod_1) buffer_1.seek(0) @@ -293,7 +293,7 @@ def test_save_scriptmodules_in_container(self): script_mods_list = [scripted_mod_a, scripted_mod_b] buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("res", "list.pkl", script_mods_list) buffer.seek(0) @@ -317,7 +317,7 @@ def test_save_eager_mods_sharing_scriptmodule(self): mod2 = ModWithSubmod(scripted_mod) buffer = BytesIO() - 
with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.intern("**") e.save_pickle("res", "mod1.pkl", mod1) e.save_pickle("res", "mod2.pkl", mod2) @@ -347,7 +347,7 @@ def test_load_shared_scriptmodules(self): mod_parent = ModWithMultipleSubmods(mod1, mod2) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.intern("**") e.save_pickle("res", "mod.pkl", mod_parent) @@ -372,7 +372,7 @@ def test_save_shared_tensors(self): mod2 = ModWithSubmodAndTensor(shared_tensor, scripted_mod) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.intern("**") e.save_pickle("res", "tensor", shared_tensor) e.save_pickle("res", "mod1.pkl", mod1) @@ -404,7 +404,7 @@ def test_load_shared_tensors(self): mod1 = ModWithTwoSubmodsAndTensor(shared_tensor, scripted_mod_0, scripted_mod_1) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.intern("**") e.save_pickle("res", "mod1.pkl", mod1) @@ -449,7 +449,7 @@ def test_load_shared_tensors_repackaged(self): mod1 = ModWithTwoSubmodsAndTensor(shared_tensor, scripted_mod_0, scripted_mod_1) buffer_0 = BytesIO() - with PackageExporter(buffer_0, verbose=False) as e: + with PackageExporter(buffer_0) as e: e.intern("**") e.save_pickle("res", "mod1.pkl", mod1) @@ -458,7 +458,7 @@ def test_load_shared_tensors_repackaged(self): loaded_mod_0 = importer_0.load_pickle("res", "mod1.pkl") buffer_1 = BytesIO() - with PackageExporter(buffer_1, importer=importer_0, verbose=False) as e: + with PackageExporter(buffer_1, importer=importer_0) as e: e.intern("**") e.save_pickle("res", "mod1.pkl", loaded_mod_0) @@ -496,7 +496,7 @@ def test_saving_and_scripting_packaged_mod(self): orig_mod = SimpleTest() buffer_0 = BytesIO() - with PackageExporter(buffer_0, verbose=False) as e: + with PackageExporter(buffer_0) as e: e.intern("**") e.save_pickle("model", "model.pkl", orig_mod) @@ -510,7 +510,7 @@ def test_saving_and_scripting_packaged_mod(self): scripted_mod = torch.jit.script(loaded_mod) buffer_1 = BytesIO() - with PackageExporter(buffer_1, importer=importer_0, verbose=False) as e: + with PackageExporter(buffer_1, importer=importer_0) as e: e.intern("**") e.save_pickle("res", "scripted_mod.pkl", scripted_mod) @@ -545,7 +545,7 @@ def forward(self, input: str): scripted_imported = torch.jit.script(imported_mod) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("model", "inline.pkl", scripted_inline) e.save_pickle("model", "imported.pkl", scripted_imported) @@ -586,7 +586,7 @@ def a_non_torch_leaf(a, b): scripted_imported = torch.jit.script(imported_mod) buffer = BytesIO() - with PackageExporter(buffer, verbose=False) as e: + with PackageExporter(buffer) as e: e.save_pickle("model", "inline.pkl", scripted_inline) e.save_pickle("model", "imported.pkl", scripted_imported) diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 8b9a2e75e2ae0..9d1cf87db7172 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -19,7 +19,6 @@ Set, Union, ) -from urllib.parse import quote import torch from torch.serialization import location_tag, normalize_storage_type @@ -170,7 +169,6 @@ def __init__( self, f: Union[str, Path, BinaryIO], importer: Union[Importer, Sequence[Importer]] = sys_importer, - verbose: bool = True, ): """ Create an exporter. 
@@ -180,8 +178,6 @@ def __init__( or a binary I/O object. importer: If a single Importer is passed, use that to search for modules. If a sequence of importers are passsed, an ``OrderedImporter`` will be constructed out of them. - verbose: Print information about dependency resolution to stdout. - Useful for tracking down why certain files get included. """ if isinstance(f, (Path, str)): f = str(f) @@ -201,7 +197,6 @@ def __init__( # - Each directed edge (u, v) means u depends on v. # - Nodes may contain metadata that describe how to write the thing to the zipfile. self.dependency_graph = DiGraph() - self.verbose = verbose self.script_module_serializer = torch._C.ScriptModuleSerializer(self.zip_file) self.storage_context = self.script_module_serializer.storage_context() @@ -332,10 +327,6 @@ def _get_dependencies( if self._module_exists(dep_module_name): dependencies[dep_module_name] = True - if self.verbose: - dep_str = "".join(f" {dep}\n" for dep in dependencies) - print(f"{module_name} depends on:\n{dep_str}\n") - return list(dependencies.keys()) def save_source_string( @@ -403,20 +394,6 @@ def _module_exists(self, module_name: str) -> bool: except Exception: return False - def _write_dep_graph(self, failing_module=None): - edges = "\n".join(f'"{f}" -> "{t}";' for f, t in self.dependency_graph.edges) - failing = "" if failing_module is None else f'"{failing_module}" [color=red];' - template = f"""\ -digraph G {{ -rankdir = LR; -node [shape=box]; -{failing} -{edges} -}} -""" - arg = quote(template, safe="") - return f"https://dreampuf.github.io/GraphvizOnline/#{arg}" - def _get_source_of_module(self, module: types.ModuleType) -> Optional[str]: filename = getattr(module, "__file__", None) result = ( @@ -438,11 +415,6 @@ def require_module_if_not_provided(self, module_name: str, dependencies=True): return if self._can_implicitly_extern(module_name): - if self.verbose: - print( - f"implicitly adding {module_name} to external modules " - f"since it is part of the standard library and is a dependency." - ) self.dependency_graph.add_node( module_name, action=_ModuleProviderAction.EXTERN, provided=True ) @@ -575,10 +547,6 @@ def save_pickle( if module not in all_dependencies: all_dependencies.append(module) - if self.verbose: - dep_string = "".join(f" {dep}\n" for dep in all_dependencies) - print(f"{resource} depends on:\n{dep_string}\n") - for module_name in all_dependencies: self.dependency_graph.add_edge(name_in_dependency_graph, module_name) self.require_module_if_not_provided(module_name) @@ -921,9 +889,6 @@ def close(self): with PackageExporter("file.zip") as e: ... 
""" - if self.verbose: - print(f"Dependency graph for exported package: \n{self._write_dep_graph()}") - self._execute_dependency_graph() self.script_module_serializer.write_files() From d52ebf2b1bcf96a3e24e03ea86b59c8cf4a06307 Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Thu, 8 Jul 2021 19:42:21 -0700 Subject: [PATCH 004/122] conv2d (#61093) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61093 Test Plan: Imported from OSS Reviewed By: eellison Differential Revision: D29562478 Pulled By: migeed-z fbshipit-source-id: d41f3a9526ee52a9571cb861be03bf9ae176a373 --- test/fx/test_gradual_type.py | 165 ++++++++++++++++++ .../experimental/graph_gradual_typechecker.py | 62 ++++++- 2 files changed, 226 insertions(+), 1 deletion(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index 8b5ece9b43b4b..813fab259632d 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -7,6 +7,11 @@ from torch.fx.experimental.rewriter import RewritingTracer from torch.fx import GraphModule +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return torch.nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + class AnnotationsTest(unittest.TestCase): def test_annotations(self): @@ -330,6 +335,166 @@ def forward(self, x: Dyn): with self.assertRaises(TypeError): tc.type_check() + def test_type_check_conv2D(self): + class BasicBlock(torch.nn.Module): + def __init__(self, inplanes, planes, stride=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + + def forward(self, x: Dyn): + identity = x + out: TensorType((2, 2, Dyn, 4)) = self.conv1(x) + out += identity + return out + + B = BasicBlock(2, 2) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + tc = GraphTypeChecker({}, traced) + tc.type_check() + for n in graph.nodes: + if n.op == 'placeholder': + assert n.type == TensorType((Dyn, Dyn, Dyn, Dyn)) + if n.op == 'call_function': + assert n.type == TensorType((Dyn, Dyn, Dyn, Dyn)) + if n.op == 'output': + assert n.type == TensorType((Dyn, Dyn, Dyn, Dyn)) + if n.op == 'call_module': + assert n.type == TensorType((2, 2, Dyn, 4)) + + def test_type_check_conv2D_2(self): + class BasicBlock(torch.nn.Module): + def __init__(self, inplanes, planes, stride=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + + def forward(self, x: TensorType((5, 2, 3, 4))): + identity = x + out = self.conv1(x) + out += identity + return out + + B = BasicBlock(2, 2) + b = B.forward(torch.rand(5, 2, 3, 4)) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + tc = GraphTypeChecker({}, traced) + tc.type_check() + t = TensorType((5, 2, 3, 4)) + for n in graph.nodes: + if n.op == 'placeholder': + assert n.type == t + if n.op == 'call_function': + assert n.type == t + if n.op == 'output': + assert torch.Size(n.type.__args__) == b.shape + if n.op == 'call_module': + assert n.type == t + + B = BasicBlock(1, 2) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, 
graph, "gm") + tc = GraphTypeChecker({}, traced) + with self.assertRaises(TypeError): + tc.type_check() + + def test_type_check_conv2D_2_fully_static(self): + annotation_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14), + (10, Dyn, 13, 14), (Dyn, Dyn, Dyn, 3)] + input_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14), + (10, 15, 13, 14), (1, 2, 2, 3)] + intermediate_types = [(1, Dyn, Dyn, 7), (2, Dyn, 4, 6), (10, 15, Dyn, 5), + (10, 15, 7, 7), (1, Dyn, Dyn, Dyn)] + in_planes_list = [2, 5, 15, 15, 2] + stride_list = [1, 2, 3, 2, 2] + out_planes_list = [2, 5, 15, 15, 2] + groups_list = [1, 5, 5, 5, 2] + dilation_list = [1, 2, 3, 3, 3] + padding_list = [1, 2, 3, 3, 3] + kernel_size_list = [1, 2, 3, 3, 3] + output_types = [(1, 2, Dyn, 7), (2, 5, 4, 6), (10, 15, Dyn, 5), (10, 15, 7, 7), (1, 2, Dyn, Dyn)] + + for i in range(5): + annotation = annotation_list[i] + input = input_list[i] + in_planes = in_planes_list[i] + stride = stride_list[i] + out_planes = out_planes_list[i] + groups = groups_list[i] + dilation = dilation_list[i] + padding = padding_list[i] + kernel_size = kernel_size_list[i] + intermediate_type = intermediate_types[i] + + class BasicBlock(torch.nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation): + super(BasicBlock, self).__init__() + self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes, + kernel_size=kernel_size, stride=stride, + padding=padding, groups=groups, bias=False, dilation=dilation) + + def forward(self, x): + out = self.conv1(x) + return out + + B = BasicBlock(in_planes, out_planes, kernel_size, stride, padding, groups, dilation) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + # annotate our argument + for n in graph.nodes: + if n.op == 'placeholder': + n.type = TensorType(annotation) + + b = B.forward(torch.rand(input)) + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in graph.nodes: + if n.op == 'output': + assert is_consistent(n.type, TensorType(b.size())) + + # test with intermediate annotations + class BasicBlock(torch.nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation): + super(BasicBlock, self).__init__() + self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes, + kernel_size=kernel_size, stride=stride, + padding=padding, groups=groups, bias=False, dilation=dilation) + + def forward(self, x): + out = self.conv1(x) + return out + + B = BasicBlock(in_planes, out_planes, kernel_size, stride, padding, groups, dilation) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + # populate our intermediate notes + for n in traced.graph.nodes: + if n.op == 'call_module': + n.type = TensorType(intermediate_type) + + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in traced.graph.nodes: + if n.op == 'output': + assert n.type == TensorType(output_types[i]) + assert is_consistent(n.type, TensorType(b.size())) + if __name__ == '__main__': unittest.main() diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index f4626660080bf..073514aed0882 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -5,6 +5,7 @@ from typing import Callable, Dict from torch.fx.node import Target, Node from torch.nn.modules.batchnorm import 
BatchNorm2d +from torch.nn.modules.conv import Conv2d _INFERENCE_RULES: Dict[Target, Callable] = {} @@ -22,7 +23,7 @@ def expand_to_tensor_dim(t, n): return TensorType(tuple(dims)) elif isinstance(t, TensorType): if len(t.__args__) != n: - raise TypeError(f'Cannot apply matching. Tensor {t} has rank {len(t.__args__)}. It should have rank {n}') + raise TypeError(f'Cannot extend tensor dimension. Tensor {t} has rank {len(t.__args__)}. It should have rank {n}') return t else: raise TypeError(f'Cannot match the type {t}') @@ -207,6 +208,65 @@ def bn2d_inference_rule(n: Node, module_instance): else: raise TypeError(f'Cannot apply {module_instance} with input type {arg_type} and existing type {n.type} on {n}') +def calculate(d_in, module_instance, index): + """ + For calculating h_in and w_out. + """ + if d_in == Dyn: + return Dyn + + elif isinstance(d_in, int): + n = d_in + 2 * module_instance.padding[index] - \ + module_instance.dilation[index] * \ + (module_instance.kernel_size[index] - 1) - 1 + + return (n // module_instance.stride[0]) + 1 + else: + raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn') + + +def get_greatest_upper_bound(type1, type2): + """ + Get the most precise type that's consistent with the given types + """ + if type1 == Dyn: + return type2 + elif type2 == Dyn: + return type1 + elif isinstance(type1, TensorType) and isinstance(type2, TensorType): + assert is_consistent(type1, type2) + gub = [t1 if is_more_precise(t1, t2) else t2 for (t1, t2) in zip(type1.__args__, type2.__args__)] + return TensorType(tuple(gub)) + else: + raise NotImplementedError(f'Greatest upper bound not yet implemented for these types {type1}, {type2}') + +@register_inference_rule(Conv2d) +def conv2d_inference_rule(n: Node, module_instance): + """ + Given a Conv2D instance and a node check the following conditions: + - the input type can be expanded to a size 4 tensor: t = (x_1, x_2, H, W) + - the current node type can be expanded to a size 4 tensor: t' = (x_1', x_2', x_3', x_4') + - x_2 is consistent with the module's in_channels + - let o = (x_1, out_channels, H_out, W_out) + then the output is the greatest upper bound of o and the existing node type t'. + """ + assert isinstance(n.args[0], Node) + n.args[0].type = expand_to_tensor_dim(n.args[0].type, 4) + arg_type = n.args[0].type + curr_node_type = expand_to_tensor_dim(n.type, 4) + + if is_consistent(arg_type.__args__[1], module_instance.in_channels): + w_in = arg_type.__args__[3] + h_in = arg_type.__args__[2] + h_out = calculate(h_in, module_instance, 0) + w_out = calculate(w_in, module_instance, 1) + new_type = TensorType((arg_type.__args__[0], module_instance.out_channels, h_out, w_out)) + gub = get_greatest_upper_bound(new_type, curr_node_type) + n.type = gub + return n.type + else: + raise TypeError(f'Cannot apply {module_instance} with input type { arg_type} and existing type {n.type} on {n}') + class GraphTypeChecker: def __init__(self, env, traced): self.env = env From 6a3170dba1b621308f80ef2bfcee181ebc20029d Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Fri, 9 Jul 2021 00:28:05 -0700 Subject: [PATCH 005/122] [package] minor cleanups to internal APIs (#61428) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61428 I was reading this code again after a while and didn't understand as quickly as I would have liked. Some of the function names are no longer accurate, etc. This PR renames these functions to be in the same language of "dependencies" that the rest of the API uses. 
I think the resulting usage of the APIs is more clear than before Test Plan: Imported from OSS Reviewed By: Chillee Differential Revision: D29620946 Pulled By: suo fbshipit-source-id: 7df640a7ffbd43998063b9ee3955c9dfcbc42cfb --- torch/package/package_exporter.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 9d1cf87db7172..6c0ceaf924039 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -358,7 +358,7 @@ def save_source_string( for dep in deps: self.dependency_graph.add_edge(module_name, dep) - self.require_module_if_not_provided(dep) + self.add_dependency(dep) def _write_source_string( self, @@ -407,7 +407,10 @@ def _get_source_of_module(self, module: types.ModuleType) -> Optional[str]: return "".join(result) - def require_module_if_not_provided(self, module_name: str, dependencies=True): + def add_dependency(self, module_name: str, dependencies=True): + """Given a module, add it to the dependency graph according to patterns + specified by the user. + """ if ( module_name in self.dependency_graph and self.dependency_graph.nodes[module_name].get("provided") is True @@ -436,7 +439,7 @@ def require_module_if_not_provided(self, module_name: str, dependencies=True): # If we are interning this module, we need to retrieve its # dependencies and package those as well. if pattern_info.action == _ModuleProviderAction.INTERN: - self._add_module_to_dependency_graph(module_name, dependencies) + self._intern_module(module_name, dependencies) return # No patterns have matched. Explicitly add this as an error. @@ -459,15 +462,19 @@ def save_module(self, module_name: str, dependencies=True): ) self.dependency_graph.add_node( - module_name, provided=True, action=_ModuleProviderAction.INTERN + module_name, + provided=True, ) - self._add_module_to_dependency_graph(module_name, dependencies) + self._intern_module(module_name, dependencies) - def _add_module_to_dependency_graph( + def _intern_module( self, module_name: str, dependencies: bool, ): + """Adds the module to the dependency graph as an interned module, + along with any metadata needed to write it out to the zipfile at serialization time. + """ module_obj = self._import_module(module_name) # Find dependencies of this module and require them as well. 
@@ -487,6 +494,7 @@ def _add_module_to_dependency_graph( error_context = f"filename: {filename}" self.dependency_graph.add_node( module_name, + action=_ModuleProviderAction.INTERN, is_package=is_package, error=packaging_error, error_context=error_context, @@ -494,14 +502,18 @@ def _add_module_to_dependency_graph( return self.dependency_graph.add_node( - module_name, is_package=is_package, source=source, provided=True + module_name, + action=_ModuleProviderAction.INTERN, + is_package=is_package, + source=source, + provided=True, ) if dependencies: deps = self._get_dependencies(source, module_name, is_package) for dep in deps: self.dependency_graph.add_edge(module_name, dep) - self.require_module_if_not_provided(dep) + self.add_dependency(dep) def save_pickle( self, package: str, resource: str, obj: Any, dependencies: bool = True @@ -549,7 +561,7 @@ def save_pickle( for module_name in all_dependencies: self.dependency_graph.add_edge(name_in_dependency_graph, module_name) - self.require_module_if_not_provided(module_name) + self.add_dependency(module_name) self._write(filename, data_value) From 15010bf2234c6d8bd4ee283b0ae8a6adb3b66b7d Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 9 Jul 2021 00:45:10 -0700 Subject: [PATCH 006/122] Make some downcast issues explicit (#60412) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60412 Test Plan: Sandcastle Reviewed By: ngimel Differential Revision: D29243195 fbshipit-source-id: c508b729d6a0e6f8a591521bce788e6cfd8531f8 --- c10/core/TensorOptions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index d31458057cd4a..2d2077b769e19 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -211,7 +211,7 @@ struct C10_API TensorOptions { /// TODO: This function encourages bad behavior (assuming CUDA is /// the only device that matters). Get rid of it / rename it. C10_NODISCARD TensorOptions - device_index(int16_t device_index) const noexcept { + device_index(c10::DeviceIndex device_index) const noexcept { return device(Device::Type::CUDA, device_index); } From 8f61d946109c8be8eb3c842f6c2adc43e44dacca Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 9 Jul 2021 00:46:22 -0700 Subject: [PATCH 007/122] Fix a variable initialization (#60896) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60896 Test Plan: Sandcastle Reviewed By: ngimel Differential Revision: D29431625 fbshipit-source-id: 076d5ed350507b3ab1f14c1a5c7700de0427eefc --- .../ao_sparse/quantized/cpu/qlinear_dynamic.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp index 0c0b4f9503eed..8d17b4285e015 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp @@ -40,18 +40,14 @@ at::Tensor PackedLinearWeightQnnp::apply_dynamic_impl( "quantized_sparse_lienar: Input tensor's last and weight tensor's" " second dimension must match."); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - float x_min; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - float x_max; + // On empty input, no output data will be generated, + // so use arbitrary qparams. + float x_min = 0; + float x_max = 0; + // Otherwise... 
if (input.numel() > 0) { x_min = input.min().item(); x_max = input.max().item(); - } else { - // On empty input, no output data will be generated, - // so use arbitrary qparams. - x_min = 0; - x_max = 0; } auto q_params = quant_utils::ChooseQuantizationParams( From 905cd6733e6b998cf268434fa31847554b91be18 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 9 Jul 2021 00:50:03 -0700 Subject: [PATCH 008/122] [DDP Comm Hook] Re-enable the optimization of fusing copy and division when no comm hook is specified (#61379) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61379 The optimization was accidentally removed in https://github.com/pytorch/pytorch/pull/59574 This optimization can help save a scan over all the input parameters, by fusing copy and div operations. Now the default temporary hook is allreduce by sum, and no extra division is done inside the hook. ghstack-source-id: 133288529 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_accumulate_gradients_no_sync buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_grad_div_uneven_inputs buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16 buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_fp16_grad_is_view buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_DistributedDataParallel_non_default_stream buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_sparse_gradient buck test mode/dev-nosan caffe2/test/distributed:c10 -- test_ddp_checkpointing_once buck test mode/dev-nosan caffe2/test/distributed:c10 -- test_ddp_checkpointing_twice Reviewed By: rohan-varma Differential Revision: D29597614 fbshipit-source-id: 2434e4fd4e6abad7871cfe47886fe97b6e4ba28f --- .../distributed/c10d/default_comm_hooks.cpp | 6 ++-- .../distributed/c10d/default_comm_hooks.hpp | 26 ++++++---------- torch/csrc/distributed/c10d/reducer.cpp | 31 ++++++++++++++----- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp index ec55ad7f2ed7c..429ed7adf3d84 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -36,12 +36,10 @@ c10::intrusive_ptr FP16CompressCommHook::runHook( return allreduce_fut->then(decompress, allreduce_fut->elementType()); } -c10::intrusive_ptr _AllReduceCommHookWithDivFactor:: +c10::intrusive_ptr _AllReduceBySumCommHook:: runHook(GradBucket& bucket) { std::vector tensors = {bucket.getTensorRef()}; - // Apply the division first to avoid overflow, especially for FP16. - tensors[0] /= state_.div_factor; - return state_.pg->allreduce(tensors)->getFuture(); + return state_->allreduce(tensors)->getFuture(); } } // namespace c10d diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.hpp b/torch/csrc/distributed/c10d/default_comm_hooks.hpp index fbb2dad8de5f0..37b64a4badb36 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.hpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.hpp @@ -30,25 +30,17 @@ class FP16CompressCommHook : public CppCommHookInterface { c10::intrusive_ptr runHook(GradBucket& bucket) override; }; -struct _AllReduceCommHookWithDivFactorState { - _AllReduceCommHookWithDivFactorState(ProcessGroup* pg, int div_factor) - : pg(pg), div_factor(div_factor) {} - - ProcessGroup* pg; - // Should be equal to the process group size, with the exception of unevent - // input. 
- int div_factor; -}; - -// Almost same as AllReduceCommHook, but requires an additional ``div_factor`` -// as the state for handling unevent input. Only used internally and not -// released as a public built-in communication hook. -class _AllReduceCommHookWithDivFactor - : public CppCommHookInterface<_AllReduceCommHookWithDivFactorState> { +// Almost same as AllReduceCommHook, but without division inside the hook. +// This enables the optimization of fusing copy and division and saves one scan +// over all the input parameters, when no communication hook is provided by the user. +// Only used internally and not released as a public built-in communication hook. +class _AllReduceBySumCommHook + : public CppCommHookInterface { public: - using CppCommHookInterface::CppCommHookInterface; + explicit _AllReduceBySumCommHook(ProcessGroup* state) + : CppCommHookInterface(state) {} - ~_AllReduceCommHookWithDivFactor() override = default; + ~_AllReduceBySumCommHook() override = default; c10::intrusive_ptr runHook(GradBucket& bucket) override; }; diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 75c708d364e08..c7402f1908161 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -391,13 +391,25 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) { // to bucket_view. If grad has already been set as views of buckets in // previous iterations, no copy is needed. if (!grad.is_alias_of(bucket_view)) { - bucket_view.copy_(grad); + if (comm_hook_ == nullptr) { + auto wrapped = at::native::wrapped_scalar_tensor(double(1.) / div_factor_); + // Divides while copying into the bucket view to save one scan over all the input parameters. + at::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } + if (gradient_as_bucket_view_) { // Let grad point to bucket_view buffer. grad = bucket_view; // The grad is modified and need to be written back. return true; } + } else { + // If grad and bucket view point to the same storage, no need to copy. + if (comm_hook_ == nullptr) { + bucket_view.div_(div_factor_); + } } } else { // Gradient is undefined. When find_unused_parameters=True, ensure it is @@ -854,9 +866,7 @@ void Reducer::mark_variable_ready(size_t variable_index) { c10::intrusive_ptr Reducer::run_comm_hook( GradBucket& grad_bucket) { if (comm_hook_ == nullptr) { - _AllReduceCommHookWithDivFactorState state( - process_group_.get(), div_factor_); - _AllReduceCommHookWithDivFactor allreduce_hook(state); + _AllReduceBySumCommHook allreduce_hook(process_group_.get()); return allreduce_hook.runHook(grad_bucket); } else { return comm_hook_->runHook(grad_bucket); @@ -1445,10 +1455,6 @@ void Reducer::finalize_backward() { require_finalize_ = false; in_ddp_backwards_ = false; - // Unset allreduce division factor, as it may change in next backwards pass - // when running with DDP join mode. - div_factor_ = kUnsetDivFactor; - // Wait for asynchronous reduction to complete and unflatten contents. for (auto& bucket : buckets_) { // See Note [DDP Communication Hook] @@ -1463,6 +1469,11 @@ void Reducer::finalize_backward() { for (const auto i : c10::irange(future_result.size())) { auto& replica = bucket.replicas[i]; if (bucket.expect_sparse_gradient) { + // If no DDP comm hook is registered, + // the allreduce only sums up the value, and a separate division is required. 
+ if (comm_hook_ == nullptr) { + future_result[i].div_(div_factor_); + } replica.contents.copy_(future_result[i]); } else { // Reinitialize only `bucket_views_out` with the future_result by @@ -1471,6 +1482,10 @@ void Reducer::finalize_backward() { } } + // Unset allreduce division factor, as it may change in next backwards pass + // when running with DDP join mode. + div_factor_ = kUnsetDivFactor; + if (!bucket.expect_sparse_gradient) { // We don't need to finalize the sparse bucket since the sparse grad and // the bucket essentially point to the same storage. As a result, once From 14f63763c16e77ac4793c938003e791dd71b217d Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 9 Jul 2021 00:59:50 -0700 Subject: [PATCH 009/122] Avoid using mp.Manager to report #GPUs needed in dist tests (#61409) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61409 We used a multiprocessing.Manager in order to share TEST_SKIPS between the parent and the child processes. TEST_SKIPS is a global variable that defines a unique error code for each "error type", so that the parent can figure out the reason a child exited. While originally this mapping was immutable, at some point we allowed children to modify the parent's value of that mapping so they could update the message for the `multi-gpu` error to make it reflect how many GPUs were really needed. This occurred in D23285790 (https://github.com/pytorch/pytorch/commit/2a4d312027f24898798e222b093e61a2427d5cee). Since then this Manager proved to be quite problematic, especially around thread safety, races, TSAN, ... (see D22753459 (https://github.com/pytorch/pytorch/commit/f0c46878c6c79fc9ac452ee72559daf0bddeb074), D23641618 (https://github.com/pytorch/pytorch/commit/567c51cce9cab86772824a589816e1644169a630), D28490129, D28794321 (https://github.com/pytorch/pytorch/commit/0128eb9a85ce2214858c5ea92d3e9de328d38468) and D29585862). This seems like an awful lot of trouble for such a small functionality. Here I propose we drop Manager and instead get the same result by using separate error codes for each number of GPUs. It should be much simpler and thus more robust. 
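To make the proposal concrete, here is a minimal sketch of the idea, kept separate from the actual change in the diff below: every GPU requirement maps to a fixed exit code, so a child process can report "needs N GPUs" without any shared state. The codes mirror the `TEST_SKIPS` entries added below; the `exit_for_missing_gpus` helper name is illustrative only.

```python
import sys
from typing import Dict, NamedTuple

import torch


class TestSkip(NamedTuple):
    exit_code: int
    message: str


# Fixed exit codes per required GPU count (same codes as in the diff below).
MULTI_GPU_SKIPS: Dict[str, TestSkip] = {
    f"multi-gpu-{n}": TestSkip(code, f"Need at least {n} CUDA devices")
    for n, code in zip(range(1, 9), (75, 77, 80, 81, 82, 83, 84, 85))
}


def exit_for_missing_gpus(required: int) -> None:
    # The child exits with a constant code; the parent looks that code up in
    # the same immutable table to print the skip reason -- no Manager needed.
    if torch.cuda.device_count() < required:
        sys.exit(MULTI_GPU_SKIPS[f"multi-gpu-{required}"].exit_code)
```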
ghstack-source-id: 133236447 Test Plan: CI Reviewed By: pritamdamania87 Differential Revision: D29612614 fbshipit-source-id: 8ad0fedcb7796e5832a0eb196f8fdc147e02b3df --- torch/testing/_internal/common_distributed.py | 44 ++++++------------- .../_internal/distributed/distributed_test.py | 2 +- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index af97bf22ab6ea..5cf5d007ad271 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -36,7 +36,14 @@ class TestSkip(NamedTuple): "backend_unavailable": TestSkip(72, "Skipped because distributed backend is not available."), "small_worldsize": TestSkip(73, "Skipped due to small world size."), "no_cuda": TestSkip(74, "CUDA is not available."), - "multi-gpu": TestSkip(75, "Need at least 2 CUDA devices"), + "multi-gpu-1": TestSkip(75, "Need at least 1 CUDA device"), + "multi-gpu-2": TestSkip(77, "Need at least 2 CUDA devices"), + "multi-gpu-3": TestSkip(80, "Need at least 3 CUDA devices"), + "multi-gpu-4": TestSkip(81, "Need at least 4 CUDA devices"), + "multi-gpu-5": TestSkip(82, "Need at least 5 CUDA devices"), + "multi-gpu-6": TestSkip(83, "Need at least 6 CUDA devices"), + "multi-gpu-7": TestSkip(84, "Need at least 7 CUDA devices"), + "multi-gpu-8": TestSkip(85, "Need at least 8 CUDA devices"), "nccl": TestSkip(76, "c10d not compiled with NCCL support"), "skipIfRocm": TestSkip(78, "Test skipped for ROCm"), "no_peer_access": TestSkip(79, "Test skipped because no GPU peer access"), @@ -49,10 +56,9 @@ def skip_if_no_gpu(func): def wrapper(*args, **kwargs): if not torch.cuda.is_available(): sys.exit(TEST_SKIPS["no_cuda"].exit_code) - if torch.cuda.device_count() < int(os.environ["WORLD_SIZE"]): - message = "Need at least {} CUDA devices".format(os.environ["WORLD_SIZE"]) - TEST_SKIPS["multi-gpu"] = TestSkip(75, message) - sys.exit(TEST_SKIPS["multi-gpu"].exit_code) + world_size = int(os.environ["WORLD_SIZE"]) + if torch.cuda.device_count() < world_size: + sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code) return func(*args, **kwargs) @@ -75,9 +81,7 @@ def decorator(func): @wraps(func) def wrapper(*args, **kwargs): if backend == "nccl" and torch.cuda.device_count() < n: - message = "Need at least {} CUDA devices".format(n) - TEST_SKIPS["multi-gpu"] = TestSkip(75, message) - sys.exit(TEST_SKIPS['multi-gpu'].exit_code) + sys.exit(TEST_SKIPS[f'multi-gpu-{n}'].exit_code) else: return func(*args, **kwargs) return wrapper @@ -91,9 +95,7 @@ def decorator(func): def wrapper(*args, **kwargs): if torch.cuda.is_available() and torch.cuda.device_count() >= x: return func(*args, **kwargs) - message = "Need at least {} CUDA devices".format(x) - TEST_SKIPS["multi-gpu"] = TestSkip(75, message) - sys.exit(TEST_SKIPS['multi-gpu'].exit_code) + sys.exit(TEST_SKIPS[f'multi-gpu-{x}'].exit_code) return wrapper return decorator @@ -108,9 +110,7 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) if torch.cuda.is_available() and torch.cuda.device_count() >= x: return func(*args, **kwargs) - message = "Need at least {} CUDA devices".format(x) - TEST_SKIPS["multi-gpu"] = TestSkip(75, message) - sys.exit(TEST_SKIPS['multi-gpu'].exit_code) + sys.exit(TEST_SKIPS[f'multi-gpu-{x}'].exit_code) return wrapper return decorator @@ -415,8 +415,6 @@ def setUp(self) -> None: self.processes = [] # type: ignore[var-annotated] self.rank = self.MAIN_PROCESS_RANK self.file_name = 
tempfile.NamedTemporaryFile(delete=False).name - global TEST_SKIPS - self.old_test_skips = TEST_SKIPS.copy() # pid to pipe consisting of error message from process. self.pid_to_pipe = {} # type: ignore[var-annotated] @@ -435,17 +433,6 @@ def _current_test_name(self) -> str: return self.id().split(".")[-1] def _start_processes(self, proc) -> None: - # Creating a Manager will spawn a subprocess which will in turn launch - # a thread. TSAN doesn't like this because there could have been other - # threads already in the parent process and mixing all this is unsafe. - # Instead we should exec after the fork (i.e., use the "spawn" method) - # so that we reset the subprocess's state before creating new threads. - test_skips_manager = torch.multiprocessing.get_context("spawn").Manager() - test_skips = test_skips_manager.dict() - global TEST_SKIPS - test_skips.update(TEST_SKIPS) - TEST_SKIPS = test_skips - self.processes = [] for rank in range(int(self.world_size)): parent_conn, child_conn = torch.multiprocessing.Pipe() @@ -613,9 +600,6 @@ def _join_processes(self, fn) -> None: for pid, pipe in self.pid_to_pipe.items(): pipe.close() - global TEST_SKIPS - TEST_SKIPS = self.old_test_skips - def _check_no_test_errors(self, elapsed_time) -> None: """ Checks that we didn't have any errors thrown in the child processes. diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 726ca4497ef4e..d5b06bb5530e1 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -520,7 +520,7 @@ def _run(cls, rank, test_name, file_name, pipe): if torch.cuda.is_available() and torch.cuda.device_count() < int( self.world_size ): - sys.exit(TEST_SKIPS["multi-gpu"].exit_code) + sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code) try: pg_timeout_seconds = CUSTOM_PG_TIMEOUT.get(test_name, default_pg_timeout) timeout = timedelta(seconds=pg_timeout_seconds) From 819bac63ff1b6db7252e59c64dc108f241c527f4 Mon Sep 17 00:00:00 2001 From: CodemodService Bot <> Date: Fri, 9 Jul 2021 04:32:59 -0700 Subject: [PATCH 010/122] [Codemod][FBSourceBlackLinter] Daily `arc lint --take BLACK` Reviewed By: zertosh Differential Revision: D29632524 fbshipit-source-id: 3eccc1804a7bf953480b9754f68ea56a2a8e3fd8 --- test/package/test_dependency_api.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index 6ec178c72cd32..bbd9c4401cfff 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -271,9 +271,7 @@ def import_module(self, module_name): buffer = BytesIO() with self.assertRaises(PackagingError) as e: - with PackageExporter( - buffer, importer=BrokenImporter() - ) as exporter: + with PackageExporter(buffer, importer=BrokenImporter()) as exporter: exporter.intern(["foo", "bar"]) exporter.save_source_string("my_module", "import foo; import bar") From 9b908ab0d0a947d89ac3137f8c4a05a87c35f568 Mon Sep 17 00:00:00 2001 From: CodemodService FBSourceClangFormatLinterBot <> Date: Fri, 9 Jul 2021 04:33:13 -0700 Subject: [PATCH 011/122] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: zertosh Differential Revision: D29631829 fbshipit-source-id: 6cef1a3a091bdf0e10838d05b2e82fc0760ebe48 --- torch/csrc/jit/passes/onnx/constant_fold.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h index 2ccf7eb28ed18..58d6eb3707e7a 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.h +++ b/torch/csrc/jit/passes/onnx/constant_fold.h @@ -20,7 +20,7 @@ c10::optional runTorchBackendForOnnx( const Node* node, std::vector& inputTensorValues, int opset_version); -} +} // namespace onnx_constant_fold void ConstantFoldONNX( Block* b, From 8423ab4f99fb499d540316e047f0d4a0c9ad630c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 9 Jul 2021 07:52:22 -0700 Subject: [PATCH 012/122] Fix `CosineAnnealingWarmRestart` annotation (#61106) Summary: Fixes https://github.com/pytorch/pytorch/issues/44770. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61106 Reviewed By: 1ntEgr8 Differential Revision: D29635764 Pulled By: walterddr fbshipit-source-id: ddc45a7f04532a76d033ae7774706da1fa8608f7 --- torch/optim/lr_scheduler.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/optim/lr_scheduler.pyi b/torch/optim/lr_scheduler.pyi index ec42bdc10c744..1c49c6e5d08a3 100644 --- a/torch/optim/lr_scheduler.pyi +++ b/torch/optim/lr_scheduler.pyi @@ -36,4 +36,4 @@ class CyclicLR(_LRScheduler): def __init__(self, optimizer: Optimizer, base_lr: float=..., max_lr: float=..., step_size_up: int=..., step_size_down: int=..., mode: str=..., gamma: float=..., scale_fn: Optional[Callable[[float], float]]=..., scale_mode: str=..., cycle_momentum: bool=..., base_momentum: float=..., max_momentum: float=..., last_epoch: int=...) -> None: ... class CosineAnnealingWarmRestarts(_LRScheduler): - def __init__(self, optimizer: Optimizer, T_0: int=..., T_mult: int=..., eta_min: int=..., last_epoch: int=...) -> None: ... + def __init__(self, optimizer: Optimizer, T_0: int=..., T_mult: int=..., eta_min: float=..., last_epoch: int=...) -> None: ... From 179249084b66b8fbacbd95d39b7cbce0cd9eb972 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 9 Jul 2021 08:24:23 -0700 Subject: [PATCH 013/122] Refactor DDP join() API, adding hooks (#60757) Summary: Targets https://github.com/pytorch/pytorch/issues/54318. **Overview:** DDP offers a `join()` context manager to accommodate training on uneven inputs. This creates a new generic `_Join()` API permitting custom hooks, refactors DDP `join()` to call this generic `_Join()`, and implements a hook for ZeRO. (For now, the generic `_Join()` is implemented as private, but this may change after design discussions are cleared.) There are two classes introduced: `_JoinHook`, the class defining the customizable join hook, and `_Join`, the generic join context manager. The `_JoinHook` provides two entry points: `main_hook()`, which is called repeatedly while there exists a non-joined process, and `post_hook()`, which is called once all process have joined with the additional `bool` argument `is_last_joiner`. The class also requires `process_group` and `device` information by defining corresponding abstract property methods. Thus, to implement a join hook, (1) inherit from `_JoinHook`, (2) override `main_hook()` and `post_hook()` as appropriate, and (3) override `process_group()` and `device()` to provide process group and device information to be used by the join context manager implementation for collective communications. The `_Join` constructor requires `join_hooks: List[_JoinHook]` and optionally `enable: bool = True` and `throw_on_early_termination: bool = False`. 
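As a concrete (if deliberately trivial) illustration of those three steps, a minimal hook could look like the sketch below. Only the `_JoinHook` entry points and properties come from this PR; the class name, constructor arguments, and the no-op bodies are invented for the example.

```python
import torch.distributed as dist
from torch.distributed.algorithms.join import _JoinHook


class NoopJoinHook(_JoinHook):
    def __init__(self, device, process_group=None):
        self._device = device
        self._pg = process_group if process_group is not None else dist.group.WORLD
        super().__init__()

    def main_hook(self):
        # Runs repeatedly on a joined rank while some other rank is still
        # training. A real hook would shadow that rank's per-iteration
        # collectives here (e.g. the allreduces of a backward pass).
        pass

    def post_hook(self, is_last_joiner: bool):
        # Runs once on every rank after all ranks have joined. A real hook
        # might synchronize final state from one of the last joiners.
        pass

    @property
    def device(self):
        return self._device

    @property
    def process_group(self):
        return self._pg
```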
A training loop only needs to be wrapped with `with _Join(join_hooks):` (using the appropriate `join_hooks`) to be able to train on uneven inputs without hanging/erroring. The context manager requires a `dist.all_reduce(torch.ones(1))` to be called on every non-joined process each time before it performs its collective communications in order to indicate that the process has not yet joined. It also requires that all `process_group` attributes in the `_JoinHook` objects are the same. **Notes:** - The argument `is_last_joiner` to `post_hook()` may be useful for finding an authoritative rank when synchronizing. - `enable` is a flag that can be set to `False` if the user knows the current training loop will not have uneven inputs. This may be used to disable join-related computation in the classes providing join hooks. - `throw_on_early_termination` is a flag that can be set to `True` to notify processes to terminate upon detecting uneven inputs (i.e. upon the first process joining when there exists a non-joined process). Notably, the notification requires an all-reduce, so to prevent hanging/erroring, non-joined process must participate in the all-reduce. The first-joining process raises a `RuntimeError`, and the other processes are expected (but not required) to do the same. This may be used to implement training on uneven inputs in cases that do not conform to the generic join context manager (e.g. `SyncBatchNorm`). - Classes providing a join hook should do so via a `_join_hook()` method that returns a `_JoinHook` instance with the methods appropriately overridden. - If there are multiple join hooks, the device specified by the first is used by the join context manager implementation to perform its collective communications. - If there are multiple join hooks, both the main and post-hooks are iterated in the order in which the `_JoinHook` objects are passed into the context manager constructor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/60757 Test Plan: The current implementation preserves backward compatibility by not changing the existing DDP `join()` API at all. To check this, I ran through the uneven input tests (`test_ddp_grad_div_uneven_inputs`, `test_ddp_uneven_inputs_stop_iteration_sync_bn`, `test_ddp_uneven_inputs`, `test_ddp_uneven_input_join_disable`, `test_ddp_uneven_input_exception`) on the AI AWS cluster: ``` touch /tmp/barrier && TEMP_DIR="/tmp" BACKEND="nccl" WORLD_SIZE="2" gpurun python test/distributed/test_distributed_fork.py -- ``` Because the existing DDP join logic does not provide correct gradients to the joined processes if `gradient_as_bucket_view=False` and a joined process requires those gradients to correctly update its shard of the parameters in `ZeroRedundancyOptimizer.step()`, DDP and ZeRO are not fully compatible at the moment. To work around this and to test ZeRO's join hook separately, I added a test `_test_zero_join()` (with `test_zero_join_gpu()` and `test_zero_join_cpu()` flavors), which compares DDP with a local optimizer on uneven inputs against ZeRO on uneven inputs with the gradients set manually. 
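For reference, the control flow the new test exercises condenses to roughly the sketch below. The model, data, and learning rate are placeholders, and in the real test a second, test-local join hook feeds the joined ranks the gradients saved from the DDP baseline so that the shadowed `step()` produces the right parameters; only the `_Join` wrapping, the per-iteration "not joined" all-reduce, and `ZeroRedundancyOptimizer._join_hook()` are pieces this PR adds.

```python
import torch
import torch.distributed as dist
from torch.distributed.algorithms.join import _Join
from torch.distributed.optim import ZeroRedundancyOptimizer


def train_with_uneven_inputs(model, inputs, labels, device):
    zero_optim = ZeroRedundancyOptimizer(
        model.parameters(), torch.optim.Adam, lr=0.01
    )
    loss_fn = torch.nn.MSELoss()
    with _Join([zero_optim._join_hook()]):
        for input in inputs:  # ranks may hold different numbers of inputs
            # Required by the context manager: signal "this rank has not
            # joined yet" before issuing this iteration's collectives.
            dist.all_reduce(torch.ones(1, device=device))
            loss_fn(model(input), labels).backward()
            # Ranks that have already joined shadow this step (and its
            # parameter-shard broadcasts) through the ZeRO join hook.
            zero_optim.step()
```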
Reviewed By: iramazanli, mrshenli Differential Revision: D29624636 Pulled By: andwgu fbshipit-source-id: ec70a290e02518b0d8b683f9fed2126705b896c7 --- .../optim/test_zero_redundancy_optimizer.py | 151 +++++++++++- torch/distributed/algorithms/join.py | 215 ++++++++++++++++++ .../optim/zero_redundancy_optimizer.py | 103 ++++++--- torch/nn/parallel/distributed.py | 192 +++++++++------- 4 files changed, 536 insertions(+), 125 deletions(-) create mode 100644 torch/distributed/algorithms/join.py diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index 0dd0dd9b98039..a18de8acace72 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -7,20 +7,22 @@ import os import sys from contextlib import suppress -from typing import List, Any, Type, cast +from typing import Any, List, Type, cast import numpy as np + import torch import torch.distributed as dist if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) +from torch.distributed.algorithms.join import _Join, _JoinHook from torch.distributed.optim import ZeroRedundancyOptimizer from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import SGD -from torch.testing._internal import common_utils, common_distributed +from torch.testing._internal import common_distributed, common_utils BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO DEVICE = "cuda" if torch.cuda.is_available() else "cpu" @@ -60,11 +62,11 @@ def tearDown(self): except OSError: pass - def dist_init(self, rank, world_size=-1): + def dist_init(self, rank, world_size=-1, backend=BACKEND): if (world_size < 1): world_size = self.world_size store = dist.FileStore(self.file_name, world_size) - return dist.init_process_group(backend=BACKEND, store=store, rank=rank, world_size=world_size) + return dist.init_process_group(backend=backend, store=store, rank=rank, world_size=world_size) class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): @@ -484,8 +486,7 @@ def closure(): def test_multiple_groups(self): """ Check that the ZeroRedundancyOptimizer handles working with multiple process groups""" - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group(backend="gloo", store=store, rank=self.rank, world_size=self.world_size) + self.dist_init(self.rank, self.world_size, dist.Backend.GLOO) # Only work with the even ranks, to check that the global_rank indexing is properly used sub_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size))) @@ -536,7 +537,7 @@ def closure(): # With SGD, Momentum is required to get a state to shard optimizer = ZeroRedundancyOptimizer( - model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99, group=process_group + model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99, process_group=process_group ) check(optimizer) @@ -552,7 +553,7 @@ def closure(): optimizer_class=SGD, lr=0.1, momentum=0.99, - group=process_group, + process_group=process_group, ) check(optimizer) @@ -656,6 +657,140 @@ def closure_sharded(input_tensor=input_tensor): for opt in [torch.optim.SGD, torch.optim.Adam]: check_optimizer_equivalence(opt) + def _test_zero_join(self, device): + r""" + Check that the ZeRO join hook allows training with uneven inputs when using the given device. 
+ + Arguments: + device (torch.device): device used to store parameters and perform + collective communications. + """ + NUM_INPUTS = 3 + NUM_EPOCHS = 2 + torch.manual_seed(0) + torch.cuda.manual_seed(0) + + rank = self.rank + world_size = self.world_size + is_gpu = device.type == "cuda" + backend = dist.Backend.NCCL if is_gpu else dist.Backend.GLOO + self.dist_init(rank, world_size, backend) + if BACKEND == dist.Backend.NCCL and is_gpu: + torch.cuda.set_device(self.device) + + model = torch.nn.Sequential( + torch.nn.Linear(2, 3), + torch.nn.Linear(3, 3), + torch.nn.Linear(3, 3), + ) + model.to(device) + + # DDP ensures correct gradients in data parallel training, so DDP with + # local optimizers on uneven inputs should be equivalent to ZeRO on + # uneven inputs with gradients being manually set + ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model) + local_optim = torch.optim.Adam(ddp_model.parameters(), lr=0.01) + zero_model = copy.deepcopy(model) + zero_model.to(device) + zero_optim = ZeroRedundancyOptimizer(zero_model.parameters(), torch.optim.Adam, lr=0.01) + loss_fn = torch.nn.MSELoss() + + # Use uneven inputs: rank i has i extra inputs + inputs = [torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)] + labels = torch.randn(20, 3).to(device) + + # Save the gradients and parameters from DDP as the ground truth; do + # so on the last-joining rank (in this case, the largest rank) + grads_at_each_iter = [] + params_at_each_iter = [] + with ddp_model.join(): + for _ in range(NUM_EPOCHS): + for input in inputs: + output = ddp_model(input) + loss_fn(output, labels).backward() + if rank == world_size - 1: + grads = [] + for p in ddp_model.parameters(): + grads.append(p.grad.detach().clone().to(device)) + local_optim.step() + if rank == world_size - 1: + params = [] + for p in ddp_model.parameters(): + params.append(p.detach().clone().to(device)) + grads_at_each_iter.append(grads) + params_at_each_iter.append(params) + + # Broadcast the saved gradients and parameters to all of the other + # ranks (which joined early) + grads_and_params = [grads_at_each_iter, params_at_each_iter] + grads_and_params = _broadcast_object(grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, device=device) + grads_at_each_iter = grads_and_params[0] + params_at_each_iter = grads_and_params[1] + # TODO: Replace this `_broadcast_object` with `broadcast_object_list` + # once the latter supports loading to the destination device instead + # of the source device + + # A process must still set the remaining gradients after joining, so we + # define a join hook to do this before the ZeRO join hook + class _JoinGradInfo(): + def __init__(self, grads, device): + self.grads = grads # remaining gradients to set (in order) + self.index = 0 + self.device = device + + class _SetGradsJoinHook(_JoinHook): + def __init__(self, zero_optim, grads, device): + zero_optim._join_grad_info = _JoinGradInfo(grads, device) + self.zero = zero_optim + super().__init__() + + def main_hook(self): + grads = self.zero._join_grad_info.grads[self.zero._join_grad_info.index] + self.zero._join_grad_info.index += 1 + for p, grad in zip(self.zero._all_params, grads): + p.grad = grad.detach().clone().to(self.zero._join_grad_info.device) + + @property + def device(self): + return self.zero._join_grad_info.device + + @property + def process_group(self): + return dist.group.WORLD + + num_grads_after_joining = NUM_EPOCHS * (world_size - rank - 1) + grads = grads_at_each_iter[-num_grads_after_joining:] + set_grads_jh = 
_SetGradsJoinHook(zero_optim, grads, device) + zero_jh = zero_optim._join_hook() + iter = 0 + with _Join([set_grads_jh, zero_jh]): + for _ in range(NUM_EPOCHS): + for input in inputs: + # Schedule an all-reduce to indicate not joined + dist.all_reduce(torch.ones(1, device=device), group=dist.group.WORLD) + + # Set gradients manually + for p, grad in zip(zero_model.parameters(), grads_at_each_iter[iter]): + p.grad = grad.detach().clone().to(device) + + # Perform optimizer step and check parity + zero_optim.step() + for p, ddp_p in zip(zero_model.parameters(), params_at_each_iter[iter]): + assert torch.allclose(p, ddp_p), \ + "Parameters differ between using ZeRO and local optimizer" + iter += 1 + + @common_distributed.requires_nccl() + @common_distributed.skip_if_lt_x_gpu(2) + def test_zero_join_gpu(self): + """Check that the ZeRO join hook allows training with uneven inputs on GPU.""" + self._test_zero_join(self.device) + + @common_distributed.requires_gloo() + def test_zero_join_cpu(self): + """Check that the ZeRO join hook allows training with uneven inputs on CPU.""" + self._test_zero_join(torch.device("cpu")) + if __name__ == "__main__": # ! unittest should not be used here, else the tests are not properly registered diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py new file mode 100644 index 0000000000000..3754fb312b75e --- /dev/null +++ b/torch/distributed/algorithms/join.py @@ -0,0 +1,215 @@ +import warnings +from abc import ABC, abstractmethod +from typing import List + +import torch +import torch.distributed as dist + + +class _JoinHook(ABC): + r""" + This defines a join hook, which provides two entry points in the join + context manager: a main hook, which is called repeatedly while there exists + a non-joined process, and a post-hook, which is called once all processes + have joined. + + To implement a join hook for the generic join context manager, define a + class that inherits from :class:`_JoinHook`, override ``main_hook()`` and + ``post_hook()`` as appropriate, and override ``device()`` and + ``process_group()`` to provide the device and process group information, + respectively, which are needed for the join context manager implementation. + """ + def main_hook(self): + r""" + This hook is called repeatedly while there exists a non-joined process + to shadow collective communications in the forward pass, backward pass, + and optimizer. + """ + ... + + def post_hook(self, is_last_joiner: bool): + r""" + This hook is called after all processes have joined. It is passed an + additional ``bool`` argument ``is_last_joiner``, which indicates if the + rank is one of the last to join. + + Arguments: + is_last_joiner (bool): ``True`` if the rank is one of the last to + join; ``False`` otherwise. + """ + ... + + @property + @abstractmethod + def device(self): + r""" + Returns the device from which to perform collective communications + needed for the join context manager implementation itself. + """ + ... + + @property + @abstractmethod + def process_group(self): + r""" + Returns the process group for join-related collective communications. + """ + ... + +class _Join(): + r""" + This class defines the generic join context manager, which allows custom + hooks to be called after a process joins. These hooks should shadow the + collective communications of non-joined processes to prevent hanging and + erroring and to ensure algorithmic correctness. Refer to :class:`_JoinHook` + for details about the hook definition. + + .. 
warning:: + The context manager requires a ``dist.all_reduce(torch.ones(1))`` to be + called on every non-joined process each time before it performs its + collective communications in order to indicate that the process has not + yet joined. For example, this can occur at the beginning of the forward + pass. + + .. warning:: + If ``throw_on_early_termination`` is enabled, then the context manager + additionally requires every non-joined process to participate in an + all-reduce before it performs its collective communications in order to + check if it should terminate due to detecting uneven inputs. This all- + reduce should be of the form ``dist.all_reduce(torch.zeros(1))``; if + the result is positive, then the process should terminate. + + .. warning:: + The context manager requires that all ``process_group`` attributes in + the ``_JoinHook`` objects are the same. If there are multiple + ``_JoinHook`` objects, then the ``device`` of the first is used. The + process group and device information is used for checking for non- + joined processes and for notifying processes to terminate if + ``throw_on_early_termination`` is eanbled, both of which using an all- + reduce. + + Arguments: + join_hooks (List[_JoinHook]): a list of the :class:`_JoinHook` s to + use; the hooks are iterated over in the given order. + + enable (bool): a flag enabling uneven input detection; setting to + ``False`` disables the context manager's functionality and should + only be set when the user knows the inputs will not be uneven + (default: ``True``). + + throw_on_early_termination (bool): a flag controlling whether to raise + an exception upon detecting uneven inputs (default: ``False``). + + """ + def __init__( + self, + join_hooks: List[_JoinHook], + enable: bool = True, + throw_on_early_termination: bool = False, + ): + if len(join_hooks) == 0: + raise ValueError("The join context manager requires at least one join hook") + self._join_hooks = join_hooks + self._enable = enable + self._throw_on_early_termination = throw_on_early_termination + self._extract_dist_info() + + def _extract_dist_info(self): + r""" + Extracts the process group and device information from the join hooks. + + Preconditions: + ``self._join_hooks`` is not ``None`` and is non-empty. + + Raises: + ValueError + If there are multiple conflicting ``process_group`` attributes + among the ``_JoinHook`` objects. + + NOTE: The context manager uses the first specified device. + """ + process_group = None + device = None + for join_hook in self._join_hooks: + if process_group is None: + process_group = join_hook.process_group + elif process_group != join_hook.process_group: + raise ValueError("Using join context manager with multiple process groups") + if device is None: + device = join_hook.device + self._process_group = process_group + self._rank = dist.get_rank(self._process_group) + self._device = device + + def __enter__(self): + ... + + def __exit__(self, type, value, traceback): + r""" + Repeatedly runs the main hooks until all processes join; then, runs + the post-hooks. + + Raises: + RuntimeError + If ``throw_on_early_termination`` is enabled. + """ + if not self._enable or type: + return # propagate the exception directly if one was raised + + all_procs_joined = False + is_last_joiner = True + + i = 0 + WARN_THRESHOLD = 1000 + warnings.simplefilter("once") + + while not all_procs_joined: + if i > WARN_THRESHOLD: + warnings.warn( + "Detected uneven input skew of greater than " + f"{WARN_THRESHOLD}. 
This means that rank " + f"{self._rank} has at least {WARN_THRESHOLD} " + f"fewer inputs than other currently-active ranks. " + "This level of skew could lead to performance " + "degradataion during training." + ) + # Shadow the all-reduce in non-joined processes + num_nonjoined_procs = self._get_num_nonjoined_procs() + if num_nonjoined_procs == 0: + all_procs_joined = True + else: + if self._throw_on_early_termination: + self._notify_procs_to_terminate() + + # Run main hooks + for join_hook in self._join_hooks: + join_hook.main_hook() + + is_last_joiner = False + i += 1 + + # Run post-hooks + for join_hook in self._join_hooks: + join_hook.post_hook(is_last_joiner) + + def _get_num_nonjoined_procs(self): + r""" + Returns the number of non-joined processes by shadowing an all-reduce + in the non-joined processes. + """ + num_nonjoined_procs = torch.zeros(1, device=self._device) + dist.all_reduce(num_nonjoined_procs, group=self._process_group) + return num_nonjoined_procs.item() + + def _notify_procs_to_terminate(self): + r""" + Schedules an all-reduce to notify non-joined processes to terminate + and raises a ``RuntimeError`` indicating that the current process has + exhausted its inputs. + """ + ones = torch.ones(1, device=self._device) + dist.all_reduce(ones, group=self._process_group) + # NOTE: Raising `StopIteration` does not throw an error in Python 3.6 + # and throws a `RuntimeError` in Python 3.7+ (PEP 479), so we just + # raise a `RuntimeError` here + raise RuntimeError(f"Rank {self._rank} exhausted all inputs.") diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py index ad5de59d934cf..24921e40942ad 100644 --- a/torch/distributed/optim/zero_redundancy_optimizer.py +++ b/torch/distributed/optim/zero_redundancy_optimizer.py @@ -6,12 +6,13 @@ import collections import copy import io +import logging from itertools import chain from typing import Any, Callable, Dict, List, Optional, Type -import logging import torch import torch.distributed as dist +from torch.distributed.algorithms.join import _JoinHook from torch.optim import Optimizer __all__ = ["ZeroRedundancyOptimizer"] @@ -102,6 +103,30 @@ def _get_global_rank(group: Any, rank: int) -> int: else dist.distributed_c10d._get_global_rank(group, rank)) +class _ZeROJoinHook(_JoinHook): + def __init__(self, zero): + assert isinstance(zero, ZeroRedundancyOptimizer), \ + "ZeRO join hook requires passing in a ZeroRedundancyOptimizer " \ + "instance as the state" + self.zero = zero + super().__init__() + + def main_hook(self): + """ + Performs an optimizer step, which updates the joined process's shard of + the parameters and broadcasts those parameters. + """ + self.zero.step() + + @property + def device(self): + return self.zero._default_device + + @property + def process_group(self): + return self.zero.process_group + + class ZeroRedundancyOptimizer(Optimizer): r""" This class wraps an arbitrary :class:`optim.Optimizer @@ -127,7 +152,7 @@ class ZeroRedundancyOptimizer(Optimizer): Keyword Args: optimizer_class (:class:`torch.nn.Optimizer`): the class of the local optimizer. - group (``ProcessGroup``, optional): ``torch.distributed`` + process_group (``ProcessGroup``, optional): ``torch.distributed`` ``ProcessGroup`` (default: ``dist.group.WORLD`` initialized by :meth:`torch.distributed.init_process_group`). 
parameters_as_bucket_view (bool): when enabled, parameters are packed @@ -168,7 +193,7 @@ def __init__( self, params, optimizer_class: Type[Optimizer], - group: Optional[Any] = None, + process_group: Optional[Any] = None, parameters_as_bucket_view: bool = False, **defaults: Any, ): @@ -196,10 +221,10 @@ def __init__( # Default device for collective communication and buckets self._default_device = self._all_params[0].device - self.group = group if group is not None else dist.group.WORLD - self.world_size = dist.get_world_size(self.group) - self.rank = dist.get_rank(self.group) - self.global_rank = _get_global_rank(self.group, self.rank) + self.process_group = process_group if process_group is not None else dist.group.WORLD + self.world_size = dist.get_world_size(self.process_group) + self.rank = dist.get_rank(self.process_group) + self.global_rank = _get_global_rank(self.process_group, self.rank) self._optim_defaults = defaults self._optim_constructor = optimizer_class @@ -287,7 +312,7 @@ def consolidate_state_dict(self, to: int = 0) -> None: # is to move all sharded state management to RPC RRef self._all_state_dicts = [] for rank in range(self.world_size): - global_rank = _get_global_rank(self.group, rank) + global_rank = _get_global_rank(self.process_group, rank) if self.rank == to: # Consolidate all local `state_dict`s on this rank, storing on # CPU to save GPU memory @@ -301,7 +326,7 @@ def consolidate_state_dict(self, to: int = 0) -> None: local_state_dict = _broadcast_object( empty_messenger, src_rank=global_rank, - group=self.group, + group=self.process_group, device=self._default_device, ) self._all_state_dicts.append( @@ -313,7 +338,7 @@ def consolidate_state_dict(self, to: int = 0) -> None: _ = _broadcast_object( self.optim.state_dict(), src_rank=self.global_rank, - group=self.group, + group=self.process_group, device=self._default_device, ) elif rank != to: @@ -322,7 +347,7 @@ def consolidate_state_dict(self, to: int = 0) -> None: _ = _broadcast_object( empty_messenger, src_rank=global_rank, - group=self.group, + group=self.process_group, device=self._default_device, ) @@ -395,6 +420,34 @@ def _index_to_param(self) -> Dict[int, torch.Tensor]: self._index_to_param_cache = list(chain(*(g["params"] for g in self.param_groups))) return self._index_to_param_cache + def _sync_parameters(self): + r""" + Syncs all parameter shards across the ranks. + + The rank sends its shard to all other ranks and receives a shard from + each other rank using ``broadcast()``. Parameters are sent bucket-by- + bucket if ``parameters_as_bucket_view`` is enabled and sent parameter- + by-parameter otherwise. 
+ """ + handles = [] + if self.parameters_as_bucket_view: + for rank, bucket in enumerate(self._buckets): + global_rank = _get_global_rank(self.process_group, rank) + handles.append( + dist.broadcast(tensor=bucket, src=global_rank, + group=self.process_group, async_op=True) + ) + else: + for rank, param_groups in enumerate(self._partition_parameters()): + global_rank = _get_global_rank(self.process_group, rank) + for param_group in param_groups: + for param in param_group["params"]: + handles.append( + dist.broadcast(tensor=param.data, src=global_rank, + group=self.process_group, async_op=True) + ) + _ = list(map(lambda x: x.wait(), handles)) + def step( self, closure: Optional[Callable[[], float]] = None, @@ -432,24 +485,7 @@ def step( loss = self.optim.step(**kwargs) # Sync all of the updated parameter shards across the ranks - handles = [] - if self.parameters_as_bucket_view: - for rank, bucket in enumerate(self._buckets): - global_rank = _get_global_rank(self.group, rank) - handles.append( - dist.broadcast(tensor=bucket, src=global_rank, - group=self.group, async_op=True) - ) - else: - for rank, param_groups in enumerate(self._partition_parameters()): - global_rank = _get_global_rank(self.group, rank) - for param_group in param_groups: - for param in param_group["params"]: - handles.append( - dist.broadcast(tensor=param.data, src=global_rank, - group=self.group, async_op=True) - ) - _ = list(map(lambda x: x.wait(), handles)) + self._sync_parameters() # Sync any updated attributes in the local optimizer to the exposed # `param_groups` @@ -457,6 +493,15 @@ def step( return loss + def _join_hook(self): + r""" + Returns the ZeRO join hook, which enables training on uneven inputs by + shadowing the collective communications in the optimizer step. + + Gradients must be properly set before this hook is called. + """ + return _ZeROJoinHook(self) + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: r""" Load the state pertaining to the given rank from the input diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 90a95335b5966..7a890f54fa0ba 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -9,21 +9,23 @@ import torch import torch.distributed as dist -from torch.autograd import Variable, Function +from torch.autograd import Function, Variable +from torch.distributed.algorithms.join import _Join, _JoinHook from torch.utils._pytree import tree_flatten, tree_unflatten RPC_AVAILABLE = False if dist.is_available(): - from torch.distributed.distributed_c10d import ReduceOp - from torch.distributed.distributed_c10d import _get_default_group + from torch.distributed.distributed_c10d import ReduceOp, _get_default_group if torch.distributed.rpc.is_available(): RPC_AVAILABLE = True from torch.distributed.rpc import RRef + from torch._utils import _get_device_index from ..modules import Module from ._functions import _get_stream -from .scatter_gather import scatter_kwargs, gather, is_namedtuple +from .scatter_gather import gather, is_namedtuple, scatter_kwargs + def _tree_flatten_with_rref(output): output_is_rref = RPC_AVAILABLE and isinstance(output, RRef) @@ -173,6 +175,76 @@ def backward(ctx, *grad_outputs): return (None, None, *grad_outputs) + +class _DDPJoinHook(_JoinHook): + def __init__(self, ddp, divide_by_initial_world_size, enable, throw_on_early_termination): + """ + Sets config variables for internal usage. 
+ """ + assert isinstance(ddp, DistributedDataParallel), \ + "DDP join hook requires passing in a DistributedDataParallel " \ + "instance as the state" + ddp.logger._set_uneven_input_join() + ddp.ddp_uneven_inputs_config = \ + _DDPUnevenInputsConfig( + ddp_join_enabled=enable, + ddp_join_divide_by_initial_world_size=divide_by_initial_world_size, + ddp_join_throw_on_early_termination=throw_on_early_termination, + ) + self.ddp = ddp + super().__init__() + + def main_hook(self): + """ + Shadows the DDP collective communication operations in the forward and + backward passes. + """ + ddp = self.ddp + # Buckets are rebuilt only once during a training period + ddp.reducer._rebuild_buckets() + + # Schedule a broadcast if we are syncing module buffers in the + # forward pass + ddp._check_and_sync_module_buffers() + + # Check if need to sync in the backward pass + work = ddp._check_global_requires_backward_grad_sync(is_joined_rank=True) + work.wait() + should_sync_backwards = work.result()[0].item() != 0 + # Forward parameter sync is disabled in the next iteration if we + # are skipping gradient sync this iteration, so set + # `require_forward_param_sync` accordingly + ddp.require_forward_param_sync = should_sync_backwards + if not should_sync_backwards: + return + + # Schedule one allreduce per gradient bucket to match the backward + # pass allreduce + ddp._match_all_reduce_for_bwd_pass() + + # Check if we need to allreduce locally unused parameters + if ddp.find_unused_parameters: + ddp._match_unused_params_allreduce() + + # Rebuilt parameters are pushed only once during a training period + ddp.reducer._push_all_rebuilt_params() + + def post_hook(self, is_last_joiner: bool): + """ + Syncs the final model to ensure that the model is the same across all + processes. + """ + self.ddp._sync_final_model(is_last_joiner) + + @property + def device(self): + return self.ddp.device + + @property + def process_group(self): + return self.ddp.process_group + + class DistributedDataParallel(Module): r"""Implements distributed data parallelism that is based on ``torch.distributed`` package at the module level. @@ -1041,12 +1113,11 @@ def _match_unused_params_allreduce(self): locally_used_param_maps = self.reducer._get_local_used_maps() self.process_group.allreduce(locally_used_param_maps) - @contextmanager def join( self, - divide_by_initial_world_size=True, - enable=True, - throw_on_early_termination=False, + divide_by_initial_world_size: bool = True, + enable: bool = True, + throw_on_early_termination: bool = False, ): r""" A context manager to be used in conjunction with an instance of @@ -1142,87 +1213,32 @@ def join( >>> # blocking for rank 1's allreduce to complete. >>> torch.cuda.synchronize(device=rank) """ - # Log uneven input API usage. - self.logger._set_uneven_input_join() - try: - has_error = False - self.ddp_uneven_inputs_config = _DDPUnevenInputsConfig( - ddp_join_enabled=enable, - ddp_join_divide_by_initial_world_size=divide_by_initial_world_size, - ddp_join_throw_on_early_termination=throw_on_early_termination, + join_hooks = [ + self._join_hook( + divide_by_initial_world_size=divide_by_initial_world_size, + enable=enable, + throw_on_early_termination=throw_on_early_termination, ) - yield - except Exception as e: - # Set to skip any processing in the finally block. - has_error = True - raise e - finally: - # Skip any processing to let the exception immediately be raised if - # there was one. 
- if enable and not has_error: - all_procs_joined = False - is_last_joiner = True - i = 0 - WARN_THRESHOLD = 1000 - warnings.simplefilter("once") - while not all_procs_joined: - if i > WARN_THRESHOLD: - my_rank = self._distributed_rank - warnings.warn( - "Detected uneven input skew of greater " - f"than {WARN_THRESHOLD}. This means that rank {my_rank} " - f"has at least {WARN_THRESHOLD} fewer inputs than " - "other currently active ranks. This level of skew could " - "lead to performance degradation during training." - ) - # Schedules allreduce to match fwd pass allreduce in non-joined procs - num_active_procs = self._schedule_shadow_all_reduce_for_fwd_pass() - if num_active_procs == 0: - all_procs_joined = True - else: - # Some DDP process still needs to be joined. - if self.ddp_uneven_inputs_config.ddp_join_throw_on_early_termination: - # Schedule allreduce telling active ranks to terminate - ones = torch.ones(1, device=self.device) - dist.all_reduce(ones, group=self.process_group) - # Raising StopIteration doesn't throw error in python 3.6 - # and throws RuntimeError in 3.7+ (PEP 479), so just - # raise RuntimeError here. - raise RuntimeError( - f"Rank {self._distributed_rank} exhausted all inputs." - ) - if is_last_joiner: - is_last_joiner = False - # It will rebuild buckets only once during training period - self.reducer._rebuild_buckets() - # Schedule a corresponding broadcast if we are syncing module - # buffers in the forward pass. - self._check_and_sync_module_buffers() - - work = self._check_global_requires_backward_grad_sync( - is_joined_rank=True - ) - work.wait() - # If nonzero, then we should sync in the bwd pass. - should_sync_backwards = work.result()[0].item() != 0 - # Forward param sync is disabled in the next iteration - # if we are skipping grad sync this iteration. Hence, we - # set require_forward_param_sync appropriately here. - self.require_forward_param_sync = should_sync_backwards - if not should_sync_backwards: - continue - # Schedules one allreduce per gradient bucket to match - # the backwards pass allreduce. - self._match_all_reduce_for_bwd_pass() - # Check if we need to allreduce locally unused params. - if self.find_unused_parameters: - self._match_unused_params_allreduce() - # It will push rebuilt params only once during training period - self.reducer._push_all_rebuilt_params() - i += 1 - - # All procs joined. Agree on authoritative rank and broadcast the model. - self._sync_final_model(is_last_joiner) + ] + return _Join(join_hooks, enable, throw_on_early_termination) + + def _join_hook( + self, + divide_by_initial_world_size: bool = True, + enable: bool = True, + throw_on_early_termination: bool = False, + ): + r""" + Returns the DDP join hook, which enables training on uneven inputs by + shadowing the collective communications in the forward and backward + passes. 
+ """ + return _DDPJoinHook( + self, + divide_by_initial_world_size=divide_by_initial_world_size, + enable=enable, + throw_on_early_termination=throw_on_early_termination + ) def register_comm_hook(self, state: object, hook: callable): r""" From 5e9bcf910180cbee048deee180e3d293a3622382 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Fri, 9 Jul 2021 08:35:59 -0700 Subject: [PATCH 014/122] fix: support removing hook in the hook (#61250) Summary: Fixes: https://github.com/pytorch/pytorch/issues/58354 Problem: Once a hook is called https://github.com/pytorch/pytorch/blob/05c1e5b65571d2fa14b0f06c37ba970fd7cc43d0/torch/csrc/autograd/python_hook.cpp#L51-L54 If the hook has `handle.remove()` while executing and if there are no references to the hook function object then `python` is free to garbage collect. At the subsequent call to https://github.com/pytorch/pytorch/blob/05c1e5b65571d2fa14b0f06c37ba970fd7cc43d0/torch/csrc/autograd/python_hook.cpp#L54 we have `hook` pointing to invalid memory Thus when we try to fetch the name for `hook` from `check_single_result` with https://github.com/pytorch/pytorch/blob/05c1e5b65571d2fa14b0f06c37ba970fd7cc43d0/torch/csrc/autograd/python_hook.cpp#L175-L177 we get segfault. Solution: Temporarily increase the life-time of hook with `Py_INCREF` till we have verified the result. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61250 Reviewed By: iramazanli Differential Revision: D29623826 Pulled By: soulitzer fbshipit-source-id: c71322311f19066cafb7203980668868c59d4e5e --- test/test_torch.py | 30 +++++++++++++++++++++++++++++ torch/csrc/autograd/python_hook.cpp | 26 +++++++++++++++++-------- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index d96161c347304..f1bb1c9a4c73b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -7704,6 +7704,36 @@ def get_dtype(scalar_type): # Reset the original dtype torch.set_default_dtype(default_dtype) + def test_hook_remove(self, device): + # Reference: https://github.com/pytorch/pytorch/issues/58354 + def _test_helper(remove_hook): + def install_hook(tensor): + handle = None + + def hook(tensor): + if remove_hook: + handle.remove() + return torch.zeros_like(tensor) + handle = tensor.register_hook(hook) + + t = torch.ones((1, 5), device=device, requires_grad=True) + install_hook(t) + + # First call to backward + t.mean().backward() + self.assertEqual(t.grad, torch.zeros_like(t)) + + # Second call to backward + t.mean().backward() + if remove_hook: + # After removing the hook, make sure the usual gradient is returned + self.assertEqual(t.grad, 0.2 * torch.ones_like(t)) + else: + self.assertEqual(t.grad, torch.zeros_like(t)) + + _test_helper(remove_hook=True) + _test_helper(remove_hook=False) + # Tests that compare a device's computation with the (gold-standard) CPU's. 
class TestDevicePrecision(TestCase): exact_dtype = True diff --git a/torch/csrc/autograd/python_hook.cpp b/torch/csrc/autograd/python_hook.cpp index d022f18ffaff7..5e30bc040273e 100644 --- a/torch/csrc/autograd/python_hook.cpp +++ b/torch/csrc/autograd/python_hook.cpp @@ -44,10 +44,19 @@ auto PyFunctionPreHook::operator()(const variable_list& values) -> variable_list THPObjectPtr value(THPVariable_Wrap(values.at(value_idx))); if (!value) throw python_error(); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - PyObject *key, *hook; - Py_ssize_t pos = 0; - while (PyDict_Next(dict, &pos, &key, &hook)) { + // Note: [Extend Hook Lifetime] + // Hold a reference to hooks till we iterate over them. + // This is to handle the case when hook calls `handle.remove` inside it + // and it's refcount goes to `0`, Python is free to GC it. + // We hold onto a stale pointer and subsequent call to + // `check_single_result`, which tries to fetch the `hook`'s name segfaults. + // So, we use `PyDict_Values` which returns a new reference to the values + // i.e. we hold the reference to the hooks till we have iterated over them. + // Reference: https://github.com/pytorch/pytorch/issues/58354 + auto hooks = THPObjectPtr{PyDict_Values(dict)}; + const auto len = PyList_Size(hooks); + for (Py_ssize_t idx = 0; idx < len; ++idx) { + const auto hook = PyList_GetItem(hooks, idx); THPObjectPtr res(PyObject_CallFunctionObjArgs(hook, value.get(), nullptr)); if (!res) throw python_error(); if (res == Py_None) continue; @@ -81,10 +90,11 @@ auto PyFunctionPostHook::operator()( THPObjectPtr outputs(wrap_variables(_outputs)); THPObjectPtr inputs(wrap_variables(_inputs)); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - PyObject *key, *hook; - Py_ssize_t pos = 0; - while (PyDict_Next(dict, &pos, &key, &hook)) { + // See Note: [Extend Hook Lifetime] + auto hooks = THPObjectPtr{PyDict_Values(dict)}; + const auto len = PyList_Size(hooks); + for (Py_ssize_t idx = 0; idx < len; ++idx) { + const auto hook = PyList_GetItem(hooks, idx); THPObjectPtr res(PyObject_CallFunctionObjArgs( hook, outputs.get(), inputs.get(), nullptr)); if (!res) throw python_error(); From c966ce693390872f64421dc456f3e07333812e5e Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Fri, 9 Jul 2021 08:40:28 -0700 Subject: [PATCH 015/122] Fix several test_ops cuda dtypes tests (#60922) Summary: Close https://github.com/pytorch/pytorch/issues/60443 Pull Request resolved: https://github.com/pytorch/pytorch/pull/60922 Reviewed By: jdonald, iramazanli Differential Revision: D29630122 Pulled By: mruberry fbshipit-source-id: 441f79828860282e5849a2565facf9e7f72912e8 --- torch/testing/_internal/common_cuda.py | 1 + .../_internal/common_methods_invocations.py | 21 +++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 60195a1eaa22c..5d0849bb8407d 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -19,6 +19,7 @@ CUDA11OrLater = torch.version.cuda and distutils.version.LooseVersion(torch.version.cuda) >= "11.0" CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.') SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3) +SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0) TEST_MAGMA = TEST_CUDA if TEST_CUDA: diff --git a/torch/testing/_internal/common_methods_invocations.py 
b/torch/testing/_internal/common_methods_invocations.py index 94960be5186f2..d8eea09102ef0 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -23,7 +23,7 @@ from torch.testing._internal.common_device_type import \ (skipIf, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIfRocm, precisionOverride, toleranceOverride, tol) -from torch.testing._internal.common_cuda import CUDA11OrLater, SM53OrLater +from torch.testing._internal.common_cuda import CUDA11OrLater, SM53OrLater, SM60OrLater from torch.testing._internal.common_utils import \ (is_iterable_of_tensors, random_symmetric_matrix, random_symmetric_psd_matrix, @@ -6104,13 +6104,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypesIfCPU=all_types_and_complex(), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), dtypesIfROCM=floating_types_and(torch.half, torch.bfloat16), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16), + backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + *[torch.bfloat16] if (SM60OrLater and CUDA11OrLater) else []), assert_autodiffed=True, sample_inputs_func=sample_inputs_matmul, skips=( - # FIXME: bfloat16 backward support likely depends on CUDA11+ - # and SM53+ - SkipInfo('TestCommon', 'test_dtypes', active_if=IS_WINDOWS), # matmul does not correctly warn when resizing out= inputs SkipInfo('TestCommon', 'test_out'), SkipInfo('TestCommon', 'test_conj_view', device_type='cpu'), @@ -6627,14 +6625,13 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypesIfCPU=all_types_and_complex(), dtypesIfCUDA=floating_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else [], torch.complex64, torch.complex128), - backward_dtypesIfCUDA=floating_types_and(torch.float16, torch.complex64, torch.complex128), + backward_dtypesIfCUDA=floating_types_and(torch.float16, + *[torch.bfloat16] if (SM60OrLater and CUDA11OrLater) else [], + torch.complex64, torch.complex128), assert_autodiffed=True, sample_inputs_func=sample_inputs_matmul, supports_out=False, skips=( - # FIXME: bfloat16 backward support likely depends on CUDA11+ - # and SM53+ - SkipInfo('TestCommon', 'test_dtypes', active_if=IS_WINDOWS), SkipInfo('TestJit', 'test_variant_consistency_jit',), )), OpInfo('__rmod__', @@ -6986,13 +6983,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): op=lambda tensors, equation: torch.einsum(equation, tensors), dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half, *[torch.bfloat16] if CUDA11OrLater else []), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.half), + backward_dtypesIfCUDA=floating_and_complex_types_and(torch.half, + *[torch.bfloat16] if (SM60OrLater and CUDA11OrLater) else []), supports_out=False, sample_inputs_func=sample_inputs_einsum, skips=( - # FIXME: bfloat16 backward support likely depends on CUDA11+ - # and SM53+ - SkipInfo('TestCommon', 'test_dtypes', active_if=IS_WINDOWS), # test does not work with passing lambda for op # there's a test `test_einsum` in `test_jit.py` to handle this case SkipInfo('TestJit', 'test_variant_consistency_jit'), From e9a40de1af2ff6374f2fcbc19f58064784785f86 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Fri, 9 Jul 2021 08:48:12 -0700 Subject: [PATCH 016/122] Add other Linux 
GPU auxiliary test jobs (#61055) Summary: - [x] add the jobs to the matrix - [x] `jit_legacy` - [x] `nogpu_NO_AVX` - [x] `nogpu_NO_AVX2` - [x] `slow` - [x] use the test config properly to enable the different test conditions - [x] validate that it works - [x] disable on pull requests before merging Pull Request resolved: https://github.com/pytorch/pytorch/pull/61055 Test Plan: CI. Example run: https://github.com/pytorch/pytorch/actions/runs/1013240987 Reviewed By: walterddr Differential Revision: D29594080 Pulled By: samestep fbshipit-source-id: 02c531ebc42feae81ecaea0785915f95e0f53ed7 --- .github/scripts/generate_ci_workflows.py | 12 ++++++++++++ .github/scripts/generate_pytorch_test_matrix.py | 10 ++++++++++ .github/templates/linux_ci_workflow.yml.j2 | 9 ++++++++- ...torch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml | 9 ++++++++- .../pytorch-linux-bionic-py3.8-gcc9-coverage.yml | 9 ++++++++- ...torch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml | 9 ++++++++- ...torch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml | 9 ++++++++- .../workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml | 9 ++++++++- .jenkins/pytorch/test.sh | 10 +++++----- 9 files changed, 75 insertions(+), 11 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 4ac3243a6fdf2..056dabeaaa4de 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -54,7 +54,11 @@ def PyTorchLinuxWorkflow( test_runner_type: str, on_pull_request: bool = False, enable_doc_jobs: bool = False, + enable_jit_legacy_test: YamlShellBool = "''", enable_multigpu_test: YamlShellBool = "''", + enable_nogpu_no_avx_test: YamlShellBool = "''", + enable_nogpu_no_avx2_test: YamlShellBool = "''", + enable_slow_test: YamlShellBool = "''", num_test_shards: int = 1, is_scheduled: Optional[str] = None, ) -> PyTorchWorkflow: @@ -65,7 +69,11 @@ def PyTorchLinuxWorkflow( "on_pull_request": on_pull_request, "is_scheduled": is_scheduled, "enable_doc_jobs": enable_doc_jobs, + "enable_jit_legacy_test": enable_jit_legacy_test, "enable_multigpu_test": enable_multigpu_test, + "enable_nogpu_no_avx_test": enable_nogpu_no_avx_test, + "enable_nogpu_no_avx2_test": enable_nogpu_no_avx2_test, + "enable_slow_test": enable_slow_test, "num_test_shards": num_test_shards, } @@ -163,7 +171,11 @@ def generate_workflow_file( build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7", docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, + enable_jit_legacy_test=1, enable_multigpu_test=1, + enable_nogpu_no_avx_test=1, + enable_nogpu_no_avx2_test=1, + enable_slow_test=1, num_test_shards=2, ), PyTorchLinuxWorkflow( diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 4edb3e92eecc6..70707c4e99f6f 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -21,11 +21,21 @@ class Config(TypedDict): def main() -> None: TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') + assert TEST_RUNNER_TYPE is not None NUM_TEST_SHARDS = int(os.getenv('NUM_TEST_SHARDS', '1')) MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE') + NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE') configs: Dict[str, Config] = {} + if os.getenv('ENABLE_JIT_LEGACY_TEST'): + configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): 
configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} + if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): + configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): + configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if os.getenv('ENABLE_SLOW_TEST'): + configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { 'include': [ { diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index b6a67e866fb81..1fcac6b7e5cb1 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -206,9 +206,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: !{{ test_runner_type }} + ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} + ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} + ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} + ENABLE_SLOW_TEST: !{{ enable_slow_test }} NUM_TEST_SHARDS: !{{ num_test_shards }} MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -256,7 +261,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -306,6 +311,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml b/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml index 163e05b3b0b26..ae5cd95da46de 100644 --- a/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml +++ b/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml @@ -194,9 +194,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -244,7 +249,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -294,6 +299,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml index 5be6776189a72..46f51ad09f325 100644 --- 
a/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml @@ -195,9 +195,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -245,7 +250,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -295,6 +300,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml b/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml index bf9baaca283f2..0ca67850462ba 100644 --- a/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml +++ b/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml @@ -194,9 +194,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_JIT_LEGACY_TEST: 1 ENABLE_MULTIGPU_TEST: 1 + ENABLE_NOGPU_NO_AVX_TEST: 1 + ENABLE_NOGPU_NO_AVX2_TEST: 1 + ENABLE_SLOW_TEST: 1 NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -244,7 +249,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -294,6 +299,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml b/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml index 2469897a9967a..f4f6e60f4e04d 100644 --- a/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml +++ b/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml @@ -194,9 +194,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -244,7 +249,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 
'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -294,6 +299,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml index 6f43ab844c790..49720b1e4ec11 100644 --- a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml @@ -195,9 +195,14 @@ jobs: runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} @@ -245,7 +250,7 @@ jobs: run: | docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }} + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -295,6 +300,8 @@ jobs: -e IN_CI \ -e SHARD_NUMBER \ -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 79ebdc9b13580..106dd098dccc3 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -23,7 +23,7 @@ export LANG=C.UTF-8 # see https://github.com/pytorch/pytorch/issues/60111 IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-} -if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_TEST_SKIP_FAST=1 fi @@ -129,9 +129,9 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then (cd test && ! 
get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") fi -if [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX-* ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX-* || $TEST_CONFIG == 'nogpu_NO_AVX' ]]; then export ATEN_CPU_CAPABILITY=default -elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then +elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* || $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then export ATEN_CPU_CAPABILITY=avx fi @@ -385,7 +385,7 @@ test_bazel() { } test_benchmarks() { - if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$BUILD_ENVIRONMENT" != *nogpu* ]]; then + if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$BUILD_ENVIRONMENT" != *nogpu* && $TEST_CONFIG != *nogpu* ]]; then pip_install --user "pytest-benchmark==3.2.3" pip_install --user "requests" BENCHMARK_DATA="benchmarks/.data" @@ -442,7 +442,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then install_torchvision test_xla -elif [[ "${BUILD_ENVIRONMENT}" == *jit_legacy-test || "${JOB_BASE_NAME}" == *jit_legacy-test ]]; then +elif [[ "${BUILD_ENVIRONMENT}" == *jit_legacy-test || "${JOB_BASE_NAME}" == *jit_legacy-test || $TEST_CONFIG == 'jit_legacy' ]]; then test_python_legacy_jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests From 2bbcc80de3e2c69c332b696749352fc8997f3040 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 9 Jul 2021 08:58:20 -0700 Subject: [PATCH 017/122] Enable disabling test cases on specific platforms (#61427) Summary: This adds functionality to our common_utils.py to allow disabling test cases for platforms Mac, Windows, and Linux. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61427 Test Plan: CI should not change as no issues currently have the line "Platforms:..." 
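For illustration, a minimal, hypothetical sketch of what a platform-specific disabling issue would look like and how its `Platforms:` line gets parsed under this change (the issue title and body below are made up; the parsing mirrors the logic added to `common_utils.py` in this patch):
```
import re

# Hypothetical disabling issue (not a real one):
title = "DISABLED test_foo (test_torch.TestTorch)"
body = "Platforms: Mac, Windows\nTraceback (most recent call last): ..."

platforms_to_skip = []
key = "platforms:"
for line in body.splitlines():
    line = line.lower()
    if line.startswith(key):
        pattern = re.compile(r"^\s+|\s*,\s*|\s+$")
        platforms_to_skip.extend(x for x in pattern.split(line[len(key):]) if x)

print(platforms_to_skip)  # expected: ['mac', 'windows']
```
With a body like that, the test is skipped only on macOS and Windows runs; on Linux it executes normally.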
I tested locally by making sure `test_async_script` is skipped while running `python test/test_jit.py -k TestAsync.test_async_script` with a cached modified `.pytorch-disabled-tests.json`: ``` { "total_count": 32, "incomplete_results": false, "items": [ { "url": "https://api.github.com/repos/pytorch/pytorch/issues/60652", "repository_url": "https://api.github.com/repos/pytorch/pytorch", "labels_url": "https://api.github.com/repos/pytorch/pytorch/issues/60652/labels{/name}", "comments_url": "https://api.github.com/repos/pytorch/pytorch/issues/60652/comments", "events_url": "https://api.github.com/repos/pytorch/pytorch/issues/60652/events", "html_url": "https://github.com/pytorch/pytorch/issues/60652", "id": 929288995, "node_id": "MDU6SXNzdWU5MjkyODg5OTU=", "number": 60652, "title": "DISABLED test_async_script (jit.test_async.TestAsync)", "user": { "login": "ezyang", "id": 13564, "node_id": "MDQ6VXNlcjEzNTY0", "avatar_url": "https://avatars.githubusercontent.com/u/13564?v=4", "gravatar_id": "", "url": "https://api.github.com/users/ezyang", "html_url": "https://github.com/ezyang", "followers_url": "https://api.github.com/users/ezyang/followers", "following_url": "https://api.github.com/users/ezyang/following{/other_user}", "gists_url": "https://api.github.com/users/ezyang/gists{/gist_id}", "starred_url": "https://api.github.com/users/ezyang/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/ezyang/subscriptions", "organizations_url": "https://api.github.com/users/ezyang/orgs", "repos_url": "https://api.github.com/users/ezyang/repos", "events_url": "https://api.github.com/users/ezyang/events{/privacy}", "received_events_url": "https://api.github.com/users/ezyang/received_events", "type": "User", "site_admin": false }, "labels": [ { "id": 1301397902, "node_id": "MDU6TGFiZWwxMzAxMzk3OTAy", "url": "https://api.github.com/repos/pytorch/pytorch/labels/module:%20flaky-tests", "name": "module: flaky-tests", "color": "f7e101", "default": false, "description": "Problem is a flaky test in CI" }, { "id": 679953883, "node_id": "MDU6TGFiZWw2Nzk5NTM4ODM=", "url": "https://api.github.com/repos/pytorch/pytorch/labels/oncall:%20distributed", "name": "oncall: distributed", "color": "f7e101", "default": false, "description": "Add this issue/PR to distributed oncall triage queue" } ], "state": "open", "locked": false, "assignee": { "login": "rohan-varma", "id": 8039770, "node_id": "MDQ6VXNlcjgwMzk3NzA=", "avatar_url": "https://avatars.githubusercontent.com/u/8039770?v=4", "gravatar_id": "", "url": "https://api.github.com/users/rohan-varma", "html_url": "https://github.com/rohan-varma", "followers_url": "https://api.github.com/users/rohan-varma/followers", "following_url": "https://api.github.com/users/rohan-varma/following{/other_user}", "gists_url": "https://api.github.com/users/rohan-varma/gists{/gist_id}", "starred_url": "https://api.github.com/users/rohan-varma/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/rohan-varma/subscriptions", "organizations_url": "https://api.github.com/users/rohan-varma/orgs", "repos_url": "https://api.github.com/users/rohan-varma/repos", "events_url": "https://api.github.com/users/rohan-varma/events{/privacy}", "received_events_url": "https://api.github.com/users/rohan-varma/received_events", "type": "User", "site_admin": false }, "assignees": [ { "login": "rohan-varma", "id": 8039770, "node_id": "MDQ6VXNlcjgwMzk3NzA=", "avatar_url": "https://avatars.githubusercontent.com/u/8039770?v=4", "gravatar_id": "", "url": 
"https://api.github.com/users/rohan-varma", "html_url": "https://github.com/rohan-varma", "followers_url": "https://api.github.com/users/rohan-varma/followers", "following_url": "https://api.github.com/users/rohan-varma/following{/other_user}", "gists_url": "https://api.github.com/users/rohan-varma/gists{/gist_id}", "starred_url": "https://api.github.com/users/rohan-varma/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/rohan-varma/subscriptions", "organizations_url": "https://api.github.com/users/rohan-varma/orgs", "repos_url": "https://api.github.com/users/rohan-varma/repos", "events_url": "https://api.github.com/users/rohan-varma/events{/privacy}", "received_events_url": "https://api.github.com/users/rohan-varma/received_events", "type": "User", "site_admin": false } ], "milestone": null, "comments": 0, "created_at": "2021-06-24T14:28:33Z", "updated_at": "2021-06-24T16:40:42Z", "closed_at": null, "author_association": "CONTRIBUTOR", "active_lock_reason": null, "body": "Platforms:Mac, windows, Linux\r\n```\r\nJun 24 00:59:14 ======================================================================\r\nJun 24 00:59:14 ERROR [0.477s]: test_async_script (__main__.ProcessGroupGlooWrapperTest)\r\nJun 24 00:59:14 ----------------------------------------------------------------------\r\nJun 24 00:59:14 Traceback (most recent call last):\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/site-packages/torch/testing/_internal/common_distributed.py\", line 398, in wrapper\r\nJun 24 00:59:14 self._join_processes(fn)\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/site-packages/torch/testing/_internal/common_distributed.py\", line 590, in _join_processes\r\nJun 24 00:59:14 self._check_return_codes(elapsed_time)\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/site-packages/torch/testing/_internal/common_distributed.py\", line 633, in _check_return_codes\r\nJun 24 00:59:14 raise RuntimeError(error)\r\nJun 24 00:59:14 RuntimeError: Process 0 exited with error code 10 and exception:\r\nJun 24 00:59:14 RuntimeError: [/var/lib/jenkins/workspace/third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [172.17.0.2]:21400\r\nJun 24 00:59:14 \r\nJun 24 00:59:14 During handling of the above exception, another exception occurred:\r\nJun 24 00:59:14 \r\nJun 24 00:59:14 Traceback (most recent call last):\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/site-packages/torch/testing/_internal/common_distributed.py\", line 516, in run_test\r\nJun 24 00:59:14 getattr(self, test_name)()\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/site-packages/torch/testing/_internal/common_distributed.py\", line 400, in wrapper\r\nJun 24 00:59:14 fn()\r\nJun 24 00:59:14 File \"distributed/test_pg_wrapper.py\", line 270, in test_collective_hang\r\nJun 24 00:59:14 self._test_collective_hang(pg)\r\nJun 24 00:59:14 File \"distributed/test_pg_wrapper.py\", line 52, in _test_collective_hang\r\nJun 24 00:59:14 wrapper_pg.allreduce([tensor])\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/unittest/case.py\", line 217, in __exit__\r\nJun 24 00:59:14 expected_regex.pattern, str(exc_value)))\r\nJun 24 00:59:14 File \"/opt/conda/lib/python3.6/unittest/case.py\", line 135, in _raiseFailure\r\nJun 24 00:59:14 raise self.test_case.failureException(msg)\r\nJun 24 00:59:14 AssertionError: \"Ranks 1 failed to pass monitoredBarrier\" does not match \"[/var/lib/jenkins/workspace/third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer 
[172.17.0.2]:21400\"\r\n```\r\n\r\nhttps://www.internalfb.com/intern/opensource/ci/job/log/225221175921058/\n\ncc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23", "performed_via_github_app": null, "score": 0.0 } ] } ``` Reviewed By: iramazanli Differential Revision: D29627799 Pulled By: janeyx99 fbshipit-source-id: 5ef79127cbe0055c4f41766048e66f98cf80d2c4 --- torch/testing/_internal/common_utils.py | 28 ++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index d3df4f0f913ff..6991e9f893069 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -310,6 +310,7 @@ def run_tests(argv=UNITTEST_ARGS): else: unittest.main(argv=argv) +IS_LINUX = sys.platform == "linux" IS_WINDOWS = sys.platform == "win32" IS_MACOS = sys.platform == "darwin" IS_PPC = platform.machine() == "ppc64le" @@ -901,14 +902,21 @@ def check_disabled(test_name): def read_and_process(): url = 'https://raw.githubusercontent.com/pytorch/test-infra/master/stats/disabled-tests.json' - contents = urlopen(url, timeout=1).read().decode('utf-8') the_response = fetch_and_cache(".pytorch-disabled-tests", url) for item in the_response['items']: title = item['title'] key = 'DISABLED ' if title.startswith(key): test_name = title[len(key):].strip() - _disabled_test_from_issues[test_name] = item['html_url'] + body = item['body'] + platforms_to_skip = [] + key = 'platforms:' + for line in body.splitlines(): + line = line.lower() + if line.startswith(key): + pattern = re.compile(r"^\s+|\s*,\s*|\s+$") + platforms_to_skip.extend([x for x in pattern.split(line[len(key):]) if x]) + _disabled_test_from_issues[test_name] = (item['html_url'], platforms_to_skip) if not IS_SANDCASTLE and os.getenv("PYTORCH_RUN_DISABLED_TESTS", "0") != "1": try: @@ -920,9 +928,19 @@ def read_and_process(): if disabled_test_from_issues is not None: if test_name in disabled_test_from_issues: - raise unittest.SkipTest( - "Test is disabled because an issue exists disabling it: {}".format(disabled_test_from_issues[test_name]) + - " To enable set the environment variable PYTORCH_RUN_DISABLED_TESTS=1") + issue_url, platforms = disabled_test_from_issues[test_name] + platform_to_conditional: Dict = { + "mac": IS_MACOS, + "macos": IS_MACOS, + "windows": IS_WINDOWS, + "linux": IS_LINUX + } + if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]): + raise unittest.SkipTest( + f"Test is disabled because an issue exists disabling it: {issue_url}" + + f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}." + + " To enable, set the environment variable PYTORCH_RUN_DISABLED_TESTS=1") + # Acquires the comparison dtype, required since isclose # requires both inputs have the same dtype, and isclose is not supported From c19adfff54d68069f9fe14532c24f62d1616aba9 Mon Sep 17 00:00:00 2001 From: zilinzhu Date: Fri, 9 Jul 2021 09:07:38 -0700 Subject: [PATCH 018/122] [DataLoader] Introduce ConcatMapDataPipe functional datapipe (#61010) Summary: As part of https://github.com/pytorch/pytorch/issues/57031, this PR adds the ConcatMapDataPipe functional datapipe for the MapDataPipe class. We may need to discuss how to treat the datapipes with no valid length. For now, I just use them as if they have infinite length and the `__getitem__` could not go pass them. 
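For illustration, a minimal usage sketch of the new `concat` functional form; `SequenceWrapper` here is a hypothetical stand-in for any map-style DataPipe (the new test below uses its own `MDP` helper the same way), and indexing follows the cumulative-length rule documented in `combining.py`:
```
from torch.utils.data import MapDataPipe

class SequenceWrapper(MapDataPipe):
    def __init__(self, seq):
        self.seq = seq

    def __getitem__(self, index):
        return self.seq[index]

    def __len__(self):
        return len(self.seq)

dp1 = SequenceWrapper(list(range(10)))
dp2 = SequenceWrapper(list(range(5)))
concat_dp = dp1.concat(dp2)   # registered via @functional_datapipe('concat')
assert len(concat_dp) == 15
assert concat_dp[3] == 3      # index 3 resolves into dp1
assert concat_dp[12] == 2     # index 12 resolves into dp2 at offset 12 - 10
```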
Thank you for your time on reviewing this~ cc ejguan Pull Request resolved: https://github.com/pytorch/pytorch/pull/61010 Reviewed By: soulitzer Differential Revision: D29587679 Pulled By: ejguan fbshipit-source-id: 5eb97fa727209bec6c534520057c64a78000626e --- test/test_datapipe.py | 16 ++++++++ torch/utils/data/datapipes/map/__init__.py | 4 +- torch/utils/data/datapipes/map/combining.py | 45 +++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 torch/utils/data/datapipes/map/combining.py diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 1ae7f38207cb5..f42faf0cb942f 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -907,6 +907,22 @@ def test_picklable(self): with self.assertRaises(AttributeError): p = pickle.dumps(datapipe) + def test_concat_datapipe(self): + input_dp1 = MDP(range(10)) + input_dp2 = MDP(range(5)) + + with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): + dp.map.Concat() + + with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"): + dp.map.Concat(input_dp1, ()) # type: ignore[arg-type] + + concat_dp = input_dp1.concat(input_dp2) + self.assertEqual(len(concat_dp), 15) + for index in range(15): + self.assertEqual(concat_dp[index], (list(range(10)) + list(range(5)))[index]) + self.assertEqual(list(concat_dp), list(range(10)) + list(range(5))) + def test_map_datapipe(self): arr = range(10) input_dp = MDP(arr) diff --git a/torch/utils/data/datapipes/map/__init__.py b/torch/utils/data/datapipes/map/__init__.py index 09fd7dc23abb0..b7609957baaa8 100644 --- a/torch/utils/data/datapipes/map/__init__.py +++ b/torch/utils/data/datapipes/map/__init__.py @@ -1,5 +1,7 @@ # Functional DataPipe from torch.utils.data.datapipes.map.callable import MapMapDataPipe as Map +from torch.utils.data.datapipes.map.combining import \ + (ConcatMapDataPipe as Concat) -__all__ = ["Map"] +__all__ = ['Map', 'Concat'] diff --git a/torch/utils/data/datapipes/map/combining.py b/torch/utils/data/datapipes/map/combining.py new file mode 100644 index 0000000000000..234d45382efe6 --- /dev/null +++ b/torch/utils/data/datapipes/map/combining.py @@ -0,0 +1,45 @@ +from torch.utils.data import MapDataPipe, functional_datapipe +from typing import Sized, Tuple, TypeVar + +T_co = TypeVar('T_co', covariant=True) + + +@functional_datapipe('concat') +class ConcatMapDataPipe(MapDataPipe): + r""" :class:`ConcatMapDataPipe`. + + Map DataPipe to concatenate multiple Map DataPipes. + The actual index is determined by the cumulative lengths of the source datapipes. + For example, if there are 2 source datapipes both with length 5, + index 0 to 4 of the resulting `ConcatMapDataPipe` would refer to + elements of the first datapipe, and 5 to 9 would refer to elements + of the second datapipe.
+ args: + datapipes: Map DataPipes being concatenated + """ + datapipes: Tuple[MapDataPipe] + length: int + + def __init__(self, *datapipes: MapDataPipe): + if len(datapipes) == 0: + raise ValueError("Expected at least one DataPipe, but got nothing") + if not all(isinstance(dp, MapDataPipe) for dp in datapipes): + raise TypeError("Expected all inputs to be `MapDataPipe`") + if not all(isinstance(dp, Sized) for dp in datapipes): + raise TypeError("Expected all inputs to be `Sized`") + self.datapipes = datapipes # type: ignore[assignment] + self.length = -1 + + def __getitem__(self, index) -> T_co: + offset = 0 + for dp in self.datapipes: + if index - offset < len(dp): + return dp[index - offset] + else: + offset += len(dp) + raise IndexError("Index {} is out of range.".format(index)) + + def __len__(self) -> int: + if self.length == -1: + self.length = sum(len(dp) for dp in self.datapipes) + return self.length From 8bd3e52e005b6c7dad0d04b8912c8e2fd9245575 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 09:24:55 -0700 Subject: [PATCH 019/122] Add conv2d transpose NNAPI converter (#59529) Summary: * Conv2d transpose support * Quantize WIP Pull Request resolved: https://github.com/pytorch/pytorch/pull/59529 Test Plan: pytest test/test_nnapi.py::TestNNAPI::test_conv2d_transpose Reviewed By: anshuljain1 Differential Revision: D28926335 fbshipit-source-id: 8f90182f96cee0a13c4f38331d421e1e8ac618de --- test/test_nnapi.py | 45 +++++++++++++++++++++++++++++ torch/backends/_nnapi/serializer.py | 19 +++++++----- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index 93c8e41308555..dbf00ba61362f 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -475,6 +475,51 @@ def test_conv2d(self): limit=limit ) + def test_conv2d_transpose(self): + in_ch, out_ch, kernel = (5, 7, (2, 2)) + input_dim = (4, 5, 3, 3) + inp = torch.randn(input_dim) + convert_dims = input_dim[:2] + (0, 0) + + for kind in ["float", "float-nhwc", "quant", "quant-nhwc"]: + with self.subTest(kind): + model = torch.nn.ConvTranspose2d(in_ch, out_ch, kernel) + output_size = model(inp).numel() + atol_rtol = (0.0002, 0) + limit = None + convert_arg = torch.zeros(*convert_dims) + + if "quant" in kind: + # FIXME 'aten::slow_conv_transpose2d' with arguments from the 'QuantizedCPU' backend + continue + model = torch.nn.Sequential(model) + model.eval() + model.qconfig = torch.quantization.get_default_qconfig('qnnpack') + model = torch.quantization.prepare(model) + model(inp) + model = torch.quantization.convert(model) + inp = qpt(inp, 1.0 / 16, 128) + # I've seen numerical differences between QNNPACK and NNAPI, + # but never more than 1 quantum, and never more than ~1% of + # the output in this test. 
+ atol_rtol = (1, 0) + limit = output_size * 0.03 + convert_arg = qpt(convert_arg, 1.0 / 16, 128) + + if "nhwc" in kind: + inp = nhwc(inp) + convert_arg = nhwc(convert_arg) + + self.check(model, inp, atol_rtol=atol_rtol, limit=limit) + self.check( + model, + inp, + convert_args=[convert_arg], + atol_rtol=atol_rtol, + limit=limit + ) + + def test_qadd(self): func = torch.nn.quantized.QFunctional() func.scale = 0.5 diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index a0d3f8030e36d..c8dae56f2877c 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1658,10 +1658,11 @@ def add_qlinear(self, node): self.add_operation(NNAPI_OperationCode.FULLY_CONNECTED, inputs, outputs) - def get_optional_bias(self, jit_bias, weight_tensor): + def get_optional_bias(self, jit_bias, weight_tensor, transpose=False): ctype, value = self.get_constant_value(jit_bias) if ctype.kind() == "NoneType": - nnapi_bias_tensor = torch.zeros(weight_tensor.size()[0], dtype=weight_tensor.dtype) + bias_idx = 1 if transpose else 0 + nnapi_bias_tensor = torch.zeros(weight_tensor.size()[bias_idx], dtype=weight_tensor.dtype) bias_id = self.add_tensor_operand_for_weight(nnapi_bias_tensor) bias_oper = self.operands[bias_id] return bias_id, bias_oper @@ -1719,10 +1720,10 @@ def add_conv_underscore(self, node): _, ) = node.inputs() - # XXX check jit_transpose _, weight_tensor = self.get_constant_value(jit_weight, "TensorType") - bias_id, bias_oper = self.get_optional_bias(jit_bias, weight_tensor) + _, transpose = self.get_constant_value(jit_transpose) + bias_id, bias_oper = self.get_optional_bias(jit_bias, weight_tensor, transpose) args = self.get_conv_pool_args_2d_from_jit( weight_tensor.shape[2:4], jit_stride, jit_pad, jit_dilation, jit_groups) @@ -1734,7 +1735,7 @@ def add_conv_underscore(self, node): weight_tensor, bias_id, args, - False, # transpose + transpose, NNAPI_FuseCode.FUSED_NONE, ) @@ -1841,7 +1842,10 @@ def add_conv2d_common( if args.group == 1: # Full convolution depthwise = False - weight_permutation = (0, 2, 3, 1) + if transpose: + weight_permutation = (1, 2, 3, 0) + else: + weight_permutation = (0, 2, 3, 1) elif args.group == in_c: # Depthwise convolution depthwise = True @@ -1883,8 +1887,7 @@ def add_conv2d_common( assert out_c == in_c else: # Full convolution - kern_nf, kern_h, kern_w, kern_d = weight_oper.shape - out_c = kern_nf + out_c, kern_h, kern_w, kern_d = weight_oper.shape assert kern_d == in_c assert out_c == bias_oper.shape[0] From ee2dd35ef4d25df4911d17d64aeea3bd254960ce Mon Sep 17 00:00:00 2001 From: shmsong Date: Fri, 9 Jul 2021 09:26:04 -0700 Subject: [PATCH 020/122] Resolving native dependency and try_run for cross compile (#59764) Summary: This is a PR on build system that provides support for cross compiling on Jetson platforms. The major change is: 1. Disable try runs for cross compiling in `COMPILER_WORKS`, `BLAS`, and `CUDA`. 
They will not be able to perform try run on a cross compile setup Pull Request resolved: https://github.com/pytorch/pytorch/pull/59764 Reviewed By: soulitzer Differential Revision: D29524363 Pulled By: malfet fbshipit-source-id: f06d1ad30b704c9a17d77db686c65c0754db07b8 --- cmake/MiscCheck.cmake | 12 +++- cmake/Modules/FindBLAS.cmake | 116 ++++++++++++++++++++--------------- cmake/public/cuda.cmake | 52 ++++++++-------- 3 files changed, 102 insertions(+), 78 deletions(-) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index e0286f5381066..d70eca36a72e6 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -46,9 +46,15 @@ if(NOT INTERN_BUILD_MOBILE) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$") list(APPEND CMAKE_REQUIRED_FLAGS "-arch ${CMAKE_HOST_SYSTEM_PROCESSOR}") endif() - CHECK_C_SOURCE_RUNS(" - int main() { return 0; } - " COMPILER_WORKS) + if(CMAKE_CROSSCOMPILING) + CHECK_C_SOURCE_COMPILES(" + int main() { return 0; } + " COMPILER_WORKS) + else() + CHECK_C_SOURCE_RUNS(" + int main() { return 0; } + " COMPILER_WORKS) + endif() if(NOT COMPILER_WORKS) # Force cmake to retest next time around unset(COMPILER_WORKS CACHE) diff --git a/cmake/Modules/FindBLAS.cmake b/cmake/Modules/FindBLAS.cmake index 5c0392c9df322..47c80b45f676f 100644 --- a/cmake/Modules/FindBLAS.cmake +++ b/cmake/Modules/FindBLAS.cmake @@ -298,56 +298,72 @@ IF (BLAS_LIBRARIES) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$") list(APPEND CMAKE_REQUIRED_FLAGS "-arch ${CMAKE_HOST_SYSTEM_PROCESSOR}") endif() - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - CHECK_C_SOURCE_RUNS(" -#include -#include -float x[4] = { 1, 2, 3, 4 }; -float y[4] = { .1, .01, .001, .0001 }; -int four = 4; -int one = 1; -extern double sdot_(); -int main() { - int i; - double r = sdot_(&four, x, &one, y, &one); - exit((float)r != (float).1234); -}" BLAS_F2C_DOUBLE_WORKS ) - CHECK_C_SOURCE_RUNS(" -#include -#include -float x[4] = { 1, 2, 3, 4 }; -float y[4] = { .1, .01, .001, .0001 }; -int four = 4; -int one = 1; -extern float sdot_(); -int main() { - int i; - double r = sdot_(&four, x, &one, y, &one); - exit((float)r != (float).1234); -}" BLAS_F2C_FLOAT_WORKS ) - IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) - MESSAGE(STATUS "This BLAS uses the F2C return conventions") - SET(BLAS_F2C TRUE) - ELSE (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) - SET(BLAS_F2C FALSE) - ENDIF(BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) - CHECK_C_SOURCE_RUNS(" -#include -#include -float x[4] = { 1, 2, 3, 4 }; -float y[4] = { .1, .01, .001, .0001 }; -extern float cblas_sdot(); -int main() { - int i; - double r = cblas_sdot(4, x, 1, y, 1); - exit((float)r != (float).1234); -}" BLAS_USE_CBLAS_DOT ) - IF (BLAS_USE_CBLAS_DOT) - SET(BLAS_USE_CBLAS_DOT TRUE) - ELSE (BLAS_USE_CBLAS_DOT) - SET(BLAS_USE_CBLAS_DOT FALSE) - ENDIF(BLAS_USE_CBLAS_DOT) - SET(CMAKE_REQUIRED_LIBRARIES) + +# Set values through env variables if cross compiling + IF (CMAKE_CROSSCOMPILING) + IF("$ENV{PYTORCH_BLAS_F2C}" STREQUAL "ON") + SET(BLAS_F2C TRUE) + ELSE() + SET(BLAS_F2C FALSE) + ENDIF() + + IF("$ENV{PYTORCH_BLAS_USE_CBLAS_DOT}" STREQUAL "ON") + SET(BLAS_USE_CBLAS_DOT TRUE) + ELSE() + SET(BLAS_USE_CBLAS_DOT FALSE) + ENDIF() + ELSE () + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + CHECK_C_SOURCE_RUNS(" + #include + #include + float x[4] = { 1, 2, 3, 4 }; + float y[4] = { .1, .01, .001, .0001 }; + int four = 4; + int one = 1; + extern double sdot_(); + int 
main() { + int i; + double r = sdot_(&four, x, &one, y, &one); + exit((float)r != (float).1234); + }" BLAS_F2C_DOUBLE_WORKS ) + CHECK_C_SOURCE_RUNS(" + #include + #include + float x[4] = { 1, 2, 3, 4 }; + float y[4] = { .1, .01, .001, .0001 }; + int four = 4; + int one = 1; + extern float sdot_(); + int main() { + int i; + double r = sdot_(&four, x, &one, y, &one); + exit((float)r != (float).1234); + }" BLAS_F2C_FLOAT_WORKS ) + IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) + MESSAGE(STATUS "This BLAS uses the F2C return conventions") + SET(BLAS_F2C TRUE) + ELSE (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) + SET(BLAS_F2C FALSE) + ENDIF(BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) + CHECK_C_SOURCE_RUNS(" + #include + #include + float x[4] = { 1, 2, 3, 4 }; + float y[4] = { .1, .01, .001, .0001 }; + extern float cblas_sdot(); + int main() { + int i; + double r = cblas_sdot(4, x, 1, y, 1); + exit((float)r != (float).1234); + }" BLAS_USE_CBLAS_DOT ) + IF (BLAS_USE_CBLAS_DOT) + SET(BLAS_USE_CBLAS_DOT TRUE) + ELSE (BLAS_USE_CBLAS_DOT) + SET(BLAS_USE_CBLAS_DOT FALSE) + ENDIF(BLAS_USE_CBLAS_DOT) + SET(CMAKE_REQUIRED_LIBRARIES) + ENDIF(CMAKE_CROSSCOMPILING) cmake_pop_check_state() ENDIF(BLAS_LIBRARIES) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 342318c2b41f9..b1d8306796473 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -64,31 +64,33 @@ if(CUDA_FOUND) " return 0;\n" "}\n" ) - try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}" - LINK_LIBRARIES ${CUDA_LIBRARIES} - RUN_OUTPUT_VARIABLE cuda_version_from_header - COMPILE_OUTPUT_VARIABLE output_var - ) - if(NOT compile_result) - message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) - endif() - message(STATUS "Caffe2: Header version is: " ${cuda_version_from_header}) - if(NOT cuda_version_from_header STREQUAL ${CUDA_VERSION_STRING}) - # Force CUDA to be processed for again next time - # TODO: I'm not sure if this counts as an implementation detail of - # FindCUDA - set(${cuda_version_from_findcuda} ${CUDA_VERSION_STRING}) - unset(CUDA_TOOLKIT_ROOT_DIR_INTERNAL CACHE) - # Not strictly necessary, but for good luck. - unset(CUDA_VERSION CACHE) - # Error out - message(FATAL_ERROR "FindCUDA says CUDA version is ${cuda_version_from_findcuda} (usually determined by nvcc), " - "but the CUDA headers say the version is ${cuda_version_from_header}. This often occurs " - "when you set both CUDA_HOME and CUDA_NVCC_EXECUTABLE to " - "non-standard locations, without also setting PATH to point to the correct nvcc. " - "Perhaps, try re-running this command again with PATH=${CUDA_TOOLKIT_ROOT_DIR}/bin:$PATH. 
" - "See above log messages for more diagnostics, and see https://github.com/pytorch/pytorch/issues/8092 for more details.") + if(NOT CMAKE_CROSSCOMPILING) + try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}" + LINK_LIBRARIES ${CUDA_LIBRARIES} + RUN_OUTPUT_VARIABLE cuda_version_from_header + COMPILE_OUTPUT_VARIABLE output_var + ) + if(NOT compile_result) + message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() + message(STATUS "Caffe2: Header version is: " ${cuda_version_from_header}) + if(NOT cuda_version_from_header STREQUAL ${CUDA_VERSION_STRING}) + # Force CUDA to be processed for again next time + # TODO: I'm not sure if this counts as an implementation detail of + # FindCUDA + set(${cuda_version_from_findcuda} ${CUDA_VERSION_STRING}) + unset(CUDA_TOOLKIT_ROOT_DIR_INTERNAL CACHE) + # Not strictly necessary, but for good luck. + unset(CUDA_VERSION CACHE) + # Error out + message(FATAL_ERROR "FindCUDA says CUDA version is ${cuda_version_from_findcuda} (usually determined by nvcc), " + "but the CUDA headers say the version is ${cuda_version_from_header}. This often occurs " + "when you set both CUDA_HOME and CUDA_NVCC_EXECUTABLE to " + "non-standard locations, without also setting PATH to point to the correct nvcc. " + "Perhaps, try re-running this command again with PATH=${CUDA_TOOLKIT_ROOT_DIR}/bin:$PATH. " + "See above log messages for more diagnostics, and see https://github.com/pytorch/pytorch/issues/8092 for more details.") + endif() endif() endif() From 64d61901eb36b20fc5ee52a93a3527d9278b8d52 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Fri, 9 Jul 2021 09:29:05 -0700 Subject: [PATCH 021/122] [ROCm] Skip test_masked_scatter_large_tensor_cuda (#61313) Summary: Refer https://github.com/pytorch/pytorch/issues/60190. Skipping unit test until hipcub issue is fixed. 
Signed-off-by: Jagadish Krishnamoorthy Pull Request resolved: https://github.com/pytorch/pytorch/pull/61313 Reviewed By: iramazanli Differential Revision: D29626664 Pulled By: malfet fbshipit-source-id: db2a390d2a3e28ec05a5032a50aa9a35c86b96ca --- test/test_torch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_torch.py b/test/test_torch.py index f1bb1c9a4c73b..b4eaa05487ff9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5911,6 +5911,8 @@ def test_masked_scatter_bool_tensor(self, device): dst = dst.masked_scatter(mask, src) self.assertEqual(dst, torch.tensor([True, True, True], device=device)) + # refer https://github.com/pytorch/pytorch/issues/60190 + @skipIfRocm @onlyCUDA @largeTensorTest('30GB') def test_masked_scatter_large_tensor(self, device): From 9e533a62f633d648ac6627ba1e55e3ef20932024 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 09:48:41 -0700 Subject: [PATCH 022/122] Make conv2d nnapi converter accept flexible batch (#61021) Summary: Same as title Pull Request resolved: https://github.com/pytorch/pytorch/pull/61021 Test Plan: pytest test/test_nnapi.py::TestNNAPI Reviewed By: anshuljain1 Differential Revision: D29480746 fbshipit-source-id: 7217c8f3a811db8c3c373f3e7ca31caf9502ef22 --- test/test_nnapi.py | 2 +- torch/backends/_nnapi/serializer.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index dbf00ba61362f..e0d3ffbc4e09e 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -444,7 +444,7 @@ def test_conv2d(self): output_size = model(inp).numel() atol_rtol = None limit = None - convert_dims = input_dim[:2] + (0, 0) + convert_dims = (0, in_ch, 0, 0) convert_arg = torch.zeros(*convert_dims) if "quant" in kind: diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index c8dae56f2877c..249aa65bb164c 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1939,9 +1939,10 @@ def _handle_conv_pool_flexible_input(self, out_id, jit_image, args, transpose): image_id, image_oper = self.get_tensor_operand_by_jitval(jit_image) batch, in_ch, in_h, in_w = image_oper.shape - if batch == 0 or in_ch == 0: - raise Exception("Only H & W can be flexible") - + if batch == 0: + self.forward_operand_shape(out_id, 0, image_id, 0) + if in_ch == 0: + raise Exception("Input channels can't be flexible") # H & W if transpose: if in_h == 0: From 24a8915534e42ac3136c9d8df0158a5d75455198 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Fri, 9 Jul 2021 09:52:29 -0700 Subject: [PATCH 023/122] Relax use-count check to allow for 0 (#61414) Summary: Previously we require tensor use count to be exactly 1. We should actually allow for use count to be zero as well. Use count is zero when an undefined tensor is returned, and this is common in backward functions that have multiple outputs. In this PR I also remove some entries from the skip list that should be covered by this change: they return multiple tensors AND are backward functions. Batch norm is also known to return undefined tensors when `training=False`. 
Related issue: https://github.com/pytorch/pytorch/issues/60426 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61414 Reviewed By: albanD Differential Revision: D29614687 Pulled By: soulitzer fbshipit-source-id: ab0892aed4bd1346b50b0a9552ffcc3287ac96af --- tools/autograd/gen_variable_type.py | 38 +++++++++-------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index fad6ad2f82bdf..3495c30d8d1a7 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -48,8 +48,7 @@ from tools.codegen.model import (Argument, NativeFunction, SchemaKind, SelfArgument, TensorOptionsArguments, BaseType, ListType) -from typing import Callable, List, Optional, Sequence, Union, Pattern, Set -import re +from typing import Callable, List, Optional, Sequence, Union # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will @@ -180,8 +179,8 @@ if (${tensor_name}_impl_saved) AT_ASSERT(${tensor_name}_impl_saved == ${tensor_name}.getIntrusivePtr()); """) -ENFORCE_TENSOR_IMPL_USE_COUNT_EQUALS_ONE = CodeTemplate("""\ -AT_ASSERT(${tensor_name}.use_count() == 1, "function: ${fn_name}"); +ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate("""\ +AT_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}"); """) ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate("""\ @@ -221,25 +220,20 @@ # These functions are expected to change impl or storage of input tensors 'set_', '_cudnn_rnn_flatten_weight', } -DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: Set[Union[str, Pattern[str]]] = { - # These non-inplace, non-out functions return tensors with use_count != 1 +DONT_ENFORCE_TENSOR_IMPL_USE_COUNT = { + # These non-inplace, non-out functions return tensors with use_count > 1 # Therefore, they MAY (but not necessarily) return one of its inputs as-is # See https://github.com/pytorch/pytorch/issues/60426 for more information - 'native_batch_norm', 'native_batch_norm_backward', 'native_group_norm_backward', - 'cudnn_batch_norm', 'native_layer_norm_backward', - - # TODO: we probably dont' want to skip all conv.*_backward - re.compile(r'conv.*_backward'), '_embedding_bag', '_embedding_bag_forward_only', 'q_per_channel_scales', 'q_per_channel_zero_points', - 'lu_unpack', + 'lu_unpack', '_cudnn_rnn_backward', # The below failed StorageImpl use_count check but we skip tensor_impl check # just in case - '_cudnn_rnn', 'dequantize_self', '_cudnn_rnn_backward', + '_cudnn_rnn', 'dequantize_self', } -DONT_ENFORCE_STORAGE_IMPL_USE_COUNT: Set[Union[str, Pattern[str]]] = { +DONT_ENFORCE_STORAGE_IMPL_USE_COUNT = { # These non-view functions return tensors with storage use_count != 1 'thnn_conv2d_forward', 'slow_conv3d_forward', 'channel_shuffle', @@ -247,16 +241,6 @@ # use count to be 1 either. 
*DONT_ENFORCE_TENSOR_IMPL_USE_COUNT, } - -def contains_string_or_matching_pattern(list: Set[Union[str, Pattern[str]]], cpp_name: str) -> bool: - for pattern in list: - if isinstance(pattern, str): - if cpp_name == pattern: - return True - else: - if pattern.search(cpp_name) is not None: - return True - return False # END CHECKS FOR [ TensorImpl and Storage Pointer Sanity Checks ] DECLARE_GRAD_FN = CodeTemplate("""\ @@ -750,12 +734,12 @@ def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> stmts_after_call += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=aliased_arg_name, out_tensor_name=ret_name)] else: - if not contains_string_or_matching_pattern(DONT_ENFORCE_STORAGE_IMPL_USE_COUNT, type_wrapper_name(f)): + if type_wrapper_name(f) not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT: stmts_after_call += [ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute( tensor_name=ret_name, fn_name=type_wrapper_name(f))] - if not contains_string_or_matching_pattern(DONT_ENFORCE_TENSOR_IMPL_USE_COUNT, type_wrapper_name(f)): - stmts_after_call += [ENFORCE_TENSOR_IMPL_USE_COUNT_EQUALS_ONE.substitute( + if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: + stmts_after_call += [ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute( tensor_name=ret_name, fn_name=type_wrapper_name(f))] # Currently we don't have any functions that return the following types, but From 09c90b3589a04fceb2fb1cb9a905c532d5058442 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 9 Jul 2021 10:43:56 -0700 Subject: [PATCH 024/122] relax type equality constraint (#60638) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60638 Initial proposal in https://github.com/pytorch/pytorch/pull/58981#issuecomment-866690334. Opposed to the proposal, this PR only allows relaxing the type equality constraint to a common superclass constraint, for example `torch.Tensor` vs `torch.nn.Parameter`. Inputs that do not share a common superclass will still fail. 
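For illustration, a minimal sketch of the resulting behaviour, mirroring the docstring examples added in this patch (it assumes a build that already includes this change):

```
import torch

t = torch.tensor([1.0, 2.0, 3.0])
p = torch.nn.Parameter(t.clone())

# Parameter is a subclass of Tensor, so the comparison now passes by default.
torch.testing.assert_close(p, t)

# Exact type equality can still be required explicitly.
try:
    torch.testing.assert_close(p, t, allow_subclasses=False)
except AssertionError:
    pass  # "type equality is required if allow_subclasses=False"

# Inputs whose types are not directly related keep failing either way.
try:
    torch.testing.assert_close(t.numpy(), t)
except AssertionError:
    pass  # "input types need to be directly related"
```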
Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D29626811 Pulled By: mruberry fbshipit-source-id: 1916c3b710d38889de7ce57eb0770c76cbbb8166 --- test/test_testing.py | 23 ++++++-- torch/testing/_asserts.py | 111 +++++++++++++++++++++++++------------- 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index 897a44a8a0590..ca72d66b8ee03 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -725,13 +725,28 @@ def assert_close_with_inputs(actual: Any, expected: Any) -> Iterator[Callable]: class TestAssertClose(TestCase): - def test_type_inequality(self): - actual = torch.empty(2) - expected = actual.tolist() + def test_mismatching_types_subclasses(self): + actual = torch.empty(()) + expected = torch.nn.Parameter(actual) + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_types_type_equality(self): + actual = torch.empty(()) + expected = torch.nn.Parameter(actual) for fn in assert_close_with_inputs(actual, expected): with self.assertRaisesRegex(AssertionError, str(type(expected))): - fn() + fn(allow_subclasses=False) + + def test_mismatching_types(self): + actual = torch.empty(2) + expected = actual.numpy() + + for fn, allow_subclasses in itertools.product(assert_close_with_inputs(actual, expected), (True, False)): + with self.assertRaisesRegex(AssertionError, str(type(expected))): + fn(allow_subclasses=allow_subclasses) def test_unknown_type(self): actual = "0" diff --git a/torch/testing/_asserts.py b/torch/testing/_asserts.py index cc3f86a58273c..8636b134626ef 100644 --- a/torch/testing/_asserts.py +++ b/torch/testing/_asserts.py @@ -577,12 +577,34 @@ def _to_tensor(array_or_scalar_like: Any) -> Tuple[Optional[_TestingErrorMeta], return None, tensor -def _to_tensor_pair(actual: Any, expected: Any) -> Tuple[Optional[_TestingErrorMeta], Optional[_TensorPair]]: +def _check_types(actual: Any, expected: Any, *, allow_subclasses: bool) -> Optional[_TestingErrorMeta]: + # We exclude numbers here, since numbers of different type, e.g. int vs. float, should be treated the same as + # tensors with different dtypes. Without user input, passing numbers of different types will still fail, but this + # can be disabled by setting `check_dtype=False`. + if isinstance(actual, numbers.Number) and isinstance(expected, numbers.Number): + return None + + msg_fmtstr = f"Except for Python scalars, {{}}, but got {type(actual)} and {type(expected)} instead." + directly_related = isinstance(actual, type(expected)) or isinstance(expected, type(actual)) + if not directly_related: + return _TestingErrorMeta(AssertionError, msg_fmtstr.format("input types need to be directly related")) + + if allow_subclasses or type(actual) is type(expected): + return None + + return _TestingErrorMeta(AssertionError, msg_fmtstr.format("type equality is required if allow_subclasses=False")) + + +def _to_tensor_pair( + actual: Any, expected: Any, *, allow_subclasses: bool +) -> Tuple[Optional[_TestingErrorMeta], Optional[_TensorPair]]: """Converts a scalar-or-array-like pair to a :class:`_TensorPair`. Args: actual (Any): Actual array-or-scalar-like. expected (Any): Expected array-or-scalar-like. + allow_subclasses (bool): If ``True`` (default) and except for Python scalars, inputs of directly related types + are allowed. Otherwise type equality is required. Returns: (Optional[_TestingErrorMeta], Optional[_TensorPair]): The two elements are orthogonal, i.e. 
if the first is @@ -590,18 +612,8 @@ def _to_tensor_pair(actual: Any, expected: Any) -> Tuple[Optional[_TestingErrorM :attr:`expected` are not scalars and do not have the same type. Additionally, returns any error meta from :func:`_to_tensor`. """ - error_meta: Optional[_TestingErrorMeta] - - # We exclude numbers here, since numbers of different type, e.g. int vs. float, should be treated the same as - # tensors with different dtypes. Without user input, passing numbers of different types will still fail, but this - # can be disabled by setting `check_dtype=False`. - if type(actual) is not type(expected) and not ( - isinstance(actual, numbers.Number) and isinstance(expected, numbers.Number) - ): - error_meta = _TestingErrorMeta( - AssertionError, - f"Except for scalars, type equality is required, but got {type(actual)} and {type(expected)} instead.", - ) + error_meta = _check_types(actual, expected, allow_subclasses=allow_subclasses) + if error_meta: return error_meta, None error_meta, actual = _to_tensor(actual) @@ -616,7 +628,7 @@ def _to_tensor_pair(actual: Any, expected: Any) -> Tuple[Optional[_TestingErrorM def _parse_inputs( - actual: Any, expected: Any + actual: Any, expected: Any, *, allow_subclasses: bool ) -> Tuple[Optional[_TestingErrorMeta], Optional[Union[_TensorPair, List, Dict]]]: """Parses the positional inputs by constructing :class:`_TensorPair`'s from corresponding array-or-scalar-likes. @@ -628,6 +640,8 @@ def _parse_inputs( Args: actual (Any): Actual input. expected (Any): Expected input. + allow_subclasses (bool): If ``True`` (default) and except for Python scalars, inputs of directly related types + are allowed. Otherwise type equality is required. Returns: (Tuple[Optional[_TestingErrorMeta], Optional[Union[_TensorPair, List, Dict]]]): The two elements are @@ -656,7 +670,7 @@ def _parse_inputs( pair_list = [] for idx in range(actual_len): - error_meta, pair = _parse_inputs(actual[idx], expected[idx]) + error_meta, pair = _parse_inputs(actual[idx], expected[idx], allow_subclasses=allow_subclasses) if error_meta: error_meta = error_meta.amend_msg(postfix=f"\n\n{_SEQUENCE_MSG_FMTSTR.format(idx)}") return error_meta, None @@ -681,7 +695,7 @@ def _parse_inputs( pair_dict = {} for key in sorted(actual_keys): - error_meta, pair = _parse_inputs(actual[key], expected[key]) + error_meta, pair = _parse_inputs(actual[key], expected[key], allow_subclasses=allow_subclasses) if error_meta: error_meta = error_meta.amend_msg(postfix=f"\n\n{_MAPPING_MSG_FMTSTR.format(key)}") return error_meta, None @@ -691,13 +705,14 @@ def _parse_inputs( return None, pair_dict else: - return _to_tensor_pair(actual, expected) + return _to_tensor_pair(actual, expected, allow_subclasses=allow_subclasses) def assert_close( actual: Any, expected: Any, *, + allow_subclasses: bool = True, rtol: Optional[float] = None, atol: Optional[float] = None, equal_nan: Union[bool, str] = False, @@ -734,15 +749,23 @@ def assert_close( :meth:`~torch.Tensor.qscheme` and the result of :meth:`~torch.Tensor.dequantize` is close according to the definition above. - :attr:`actual` and :attr:`expected` can be :class:`~torch.Tensor`'s or any array-or-scalar-like of the same type, - from which :class:`torch.Tensor`'s can be constructed with :func:`torch.as_tensor`. 
In addition, :attr:`actual` and - :attr:`expected` can be :class:`~collections.abc.Sequence`'s or :class:`~collections.abc.Mapping`'s in which case - they are considered close if their structure matches and all their elements are considered close according to the - above definition. + :attr:`actual` and :attr:`expected` can be :class:`~torch.Tensor`'s or any tensor-or-scalar-likes from which + :class:`torch.Tensor`'s can be constructed with :func:`torch.as_tensor`. Except for Python scalars the input types + have to be directly related. In addition, :attr:`actual` and :attr:`expected` can be + :class:`~collections.abc.Sequence`'s or :class:`~collections.abc.Mapping`'s in which case they are considered close + if their structure matches and all their elements are considered close according to the above definition. + + .. note:: + + Python scalars are an exception to the type relation requirement, because their :func:`type`, i.e. + :class:`int`, :class:`float`, and :class:`complex`, is equivalent to the ``dtype`` of a tensor-like. Thus, + Python scalars of different types can be checked, but require :attr:`check_dtype` to be set to ``False``. Args: actual (Any): Actual input. expected (Any): Expected input. + allow_subclasses (bool): If ``True`` (default) and except for Python scalars, inputs of directly related types + are allowed. Otherwise type equality is required. rtol (Optional[float]): Relative tolerance. If specified :attr:`atol` must also be specified. If omitted, default values based on the :attr:`~torch.Tensor.dtype` are selected with the below table. atol (Optional[float]): Absolute tolerance. If specified :attr:`rtol` must also be specified. If omitted, @@ -766,19 +789,22 @@ def assert_close( Raises: UsageError: If a :class:`torch.Tensor` can't be constructed from an array-or-scalar-like. UsageError: If only :attr:`rtol` or :attr:`atol` is specified. - AssertionError: If corresponding array-likes have different types. + AssertionError: If corresponding inputs are not Python scalars and are not directly related. + AssertionError: If :attr:`allow_subclasses` is ``False``, but corresponding inputs are not Python scalars and + have different types. AssertionError: If the inputs are :class:`~collections.abc.Sequence`'s, but their length does not match. AssertionError: If the inputs are :class:`~collections.abc.Mapping`'s, but their set of keys do not match. AssertionError: If corresponding tensors do not have the same :attr:`~torch.Tensor.shape`. AssertionError: If corresponding tensors do not have the same :attr:`~torch.Tensor.layout`. AssertionError: If corresponding tensors are quantized, but have different :meth:`~torch.Tensor.qscheme`'s. - AssertionError: If :attr:`check_device`, but corresponding tensors are not on the same + AssertionError: If :attr:`check_device` is ``True``, but corresponding tensors are not on the same :attr:`~torch.Tensor.device`. - AssertionError: If :attr:`check_dtype`, but corresponding tensors do not have the same ``dtype``. - AssertionError: If :attr:`check_stride`, but corresponding strided tensors do not have the same stride. - AssertionError: If :attr:`check_is_coalesced`, but corresponding sparse COO tensors are not both either - coalesced or uncoalesced. - AssertionError: If the values of corresponding tensors are not close. + AssertionError: If :attr:`check_dtype` is ``True``, but corresponding tensors do not have the same ``dtype``. 
+ AssertionError: If :attr:`check_stride` is ``True``, but corresponding strided tensors do not have the same + stride. + AssertionError: If :attr:`check_is_coalesced` is ``True``, but corresponding sparse COO tensors are not both + either coalesced or uncoalesced. + AssertionError: If the values of corresponding tensors are not close according to the definition above. The following table displays the default ``rtol`` and ``atol`` for different ``dtype``'s. In case of mismatching ``dtype``'s, the maximum of both tolerances is used. @@ -870,16 +896,27 @@ def assert_close( >>> actual = {"baz": baz, "bar": bar, "foo": foo} >>> torch.testing.assert_close(actual, expected) - >>> # Different input types are never considered close. >>> expected = torch.tensor([1.0, 2.0, 3.0]) - >>> actual = expected.numpy() - >>> torch.testing.assert_close(actual, expected) + >>> actual = expected.clone() + >>> # By default, directly related instances can be compared + >>> torch.testing.assert_close(torch.nn.Parameter(actual), expected) + >>> # This check can be made more strict with allow_subclasses=False + >>> torch.testing.assert_close( + ... torch.nn.Parameter(actual), expected, allow_subclasses=False + ... ) + Traceback (most recent call last): + ... + AssertionError: Except for Python scalars, type equality is required if + allow_subclasses=False, but got and + instead. + >>> # If the inputs are not directly related, they are never considered close + >>> torch.testing.assert_close(actual.numpy(), expected) Traceback (most recent call last): ... - AssertionError: Except for scalars, type equality is required, but got - and instead. - >>> # Scalars of different types are an exception and can be compared with - >>> # check_dtype=False. + AssertionError: Except for Python scalars, input types need to be directly + related, but got and instead. + >>> # Exceptions to these rules are Python scalars. They can be checked regardless of + >>> # their type if check_dtype=False. >>> torch.testing.assert_close(1.0, 1, check_dtype=False) >>> # NaN != NaN by default. @@ -938,7 +975,7 @@ def assert_close( f"but got no {'rtol' if rtol is None else 'atol'}.", ) - error_meta, pair = _parse_inputs(actual, expected) + error_meta, pair = _parse_inputs(actual, expected, allow_subclasses=allow_subclasses) if error_meta: raise error_meta.to_error() else: From 5401dd2f9a9bbfcca1f128e2d89b58e934e1b196 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 9 Jul 2021 10:43:56 -0700 Subject: [PATCH 025/122] change language from array to tensor (#60639) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60639 Test Plan: Imported from OSS Reviewed By: iramazanli Differential Revision: D29626812 Pulled By: mruberry fbshipit-source-id: 1b0e78426fd08d7b72d890adc9811d31afd805fe --- torch/testing/_asserts.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/torch/testing/_asserts.py b/torch/testing/_asserts.py index 8636b134626ef..9206a96699ea3 100644 --- a/torch/testing/_asserts.py +++ b/torch/testing/_asserts.py @@ -546,10 +546,10 @@ def _check_pair_close( return _check_tensors_close(pair.actual, pair.expected, **kwargs) -def _to_tensor(array_or_scalar_like: Any) -> Tuple[Optional[_TestingErrorMeta], Optional[Tensor]]: - """Converts a scalar-or-array-like to a :class:`~torch.Tensor`. +def _to_tensor(tensor_or_scalar_like: Any) -> Tuple[Optional[_TestingErrorMeta], Optional[Tensor]]: + """Converts a tensor-or-scalar-like to a :class:`~torch.Tensor`. 
Args: - array_or_scalar_like (Any): Scalar-or-array-like. + tensor_or_scalar_like (Any): Tensor-or-scalar-like. Returns: (Tuple[Optional[_TestingErrorMeta], Optional[Tensor]]): The two elements are orthogonal, i.e. if the first is @@ -559,14 +559,14 @@ def _to_tensor(array_or_scalar_like: Any) -> Tuple[Optional[_TestingErrorMeta], """ error_meta: Optional[_TestingErrorMeta] - if isinstance(array_or_scalar_like, Tensor): - tensor = array_or_scalar_like + if isinstance(tensor_or_scalar_like, Tensor): + tensor = tensor_or_scalar_like else: try: - tensor = torch.as_tensor(array_or_scalar_like) + tensor = torch.as_tensor(tensor_or_scalar_like) except Exception: error_meta = _TestingErrorMeta( - UsageError, f"No tensor can be constructed from type {type(array_or_scalar_like)}." + UsageError, f"No tensor can be constructed from type {type(tensor_or_scalar_like)}." ) return error_meta, None @@ -598,11 +598,11 @@ def _check_types(actual: Any, expected: Any, *, allow_subclasses: bool) -> Optio def _to_tensor_pair( actual: Any, expected: Any, *, allow_subclasses: bool ) -> Tuple[Optional[_TestingErrorMeta], Optional[_TensorPair]]: - """Converts a scalar-or-array-like pair to a :class:`_TensorPair`. + """Converts a tensor-or-scalar-like pair to a :class:`_TensorPair`. Args: - actual (Any): Actual array-or-scalar-like. - expected (Any): Expected array-or-scalar-like. + actual (Any): Actual tensor-or-scalar-like. + expected (Any): Expected tensor-or-scalar-like. allow_subclasses (bool): If ``True`` (default) and except for Python scalars, inputs of directly related types are allowed. Otherwise type equality is required. @@ -630,7 +630,7 @@ def _to_tensor_pair( def _parse_inputs( actual: Any, expected: Any, *, allow_subclasses: bool ) -> Tuple[Optional[_TestingErrorMeta], Optional[Union[_TensorPair, List, Dict]]]: - """Parses the positional inputs by constructing :class:`_TensorPair`'s from corresponding array-or-scalar-likes. + """Parses the positional inputs by constructing :class:`_TensorPair`'s from corresponding tensor-or-scalar-likes. :class:`~collections.abc.Sequence`'s or :class:`~collections.abc.Mapping`'s are parsed elementwise. Parsing is @@ -787,7 +787,7 @@ def assert_close( the mismatching tensors and a namespace of diagnostics about the mismatches. See below for details. Raises: - UsageError: If a :class:`torch.Tensor` can't be constructed from an array-or-scalar-like. + UsageError: If no :class:`torch.Tensor` can be constructed from an input. UsageError: If only :attr:`rtol` or :attr:`atol` is specified. AssertionError: If corresponding inputs are not Python scalars and are not directly related. AssertionError: If :attr:`allow_subclasses` is ``False``, but corresponding inputs are not Python scalars and From 682ebc1dd11c6cd14ca5c855e1fa0734b1bc70c5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 9 Jul 2021 10:43:56 -0700 Subject: [PATCH 026/122] remove UsageError in favor of ValueError (#61031) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61031 See https://github.com/pytorch/pytorch/pull/58916#issuecomment-868519515. 
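To make the user-facing effect concrete, a small example of the new error type (a sketch, assuming a build with this patch applied; the comments paraphrase the messages raised in the code below):

```
import torch

t = torch.ones(3)

# Specifying only one of rtol/atol used to raise UsageError; it is now a ValueError.
try:
    torch.testing.assert_close(t, t.clone(), rtol=0.0)
except ValueError:
    pass  # both 'rtol' and 'atol' must be specified or omitted together

# Inputs that cannot be converted to a tensor also raise ValueError now.
try:
    torch.testing.assert_close("0", "0")
except ValueError:
    pass  # no tensor can be constructed from type str
```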
Test Plan: Imported from OSS Reviewed By: iramazanli Differential Revision: D29626810 Pulled By: mruberry fbshipit-source-id: 25ddf26815f9ef82b8234d7dac811a6a13a53c54 --- test/test_testing.py | 9 ++++----- torch/testing/_asserts.py | 25 +++++++------------------ 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index ca72d66b8ee03..ca85e5647164b 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -18,7 +18,6 @@ deviceCountAtLeast) from torch.testing._internal.common_methods_invocations import op_db import torch.testing._internal.opinfo_helper as opinfo_helper -from torch.testing._asserts import UsageError # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -753,7 +752,7 @@ def test_unknown_type(self): expected = "0" for fn in assert_close_with_inputs(actual, expected): - with self.assertRaisesRegex(UsageError, str(type(actual))): + with self.assertRaisesRegex(ValueError, str(type(actual))): fn() def test_mismatching_shape(self): @@ -770,7 +769,7 @@ def test_unknown_layout(self): expected = actual.to_mkldnn() for fn in assert_close_with_inputs(actual, expected): - with self.assertRaises(UsageError): + with self.assertRaises(ValueError): fn() def test_mismatching_layout(self): @@ -817,7 +816,7 @@ def test_only_rtol(self): expected = actual.clone() for fn in assert_close_with_inputs(actual, expected): - with self.assertRaises(UsageError): + with self.assertRaises(ValueError): fn(rtol=0.0) def test_only_atol(self): @@ -825,7 +824,7 @@ def test_only_atol(self): expected = actual.clone() for fn in assert_close_with_inputs(actual, expected): - with self.assertRaises(UsageError): + with self.assertRaises(ValueError): fn(atol=0.0) def test_mismatching_values(self): diff --git a/torch/testing/_asserts.py b/torch/testing/_asserts.py index 9206a96699ea3..172e2cc0fbef0 100644 --- a/torch/testing/_asserts.py +++ b/torch/testing/_asserts.py @@ -12,24 +12,14 @@ __all__ = ["assert_close"] -# The UsageError should be raised in case the test function is not used correctly. With this the user is able to -# differentiate between a test failure (there is a bug in the tested code) and a test error (there is a bug in the -# test). -class UsageError(Exception): - pass - - -_TestingError = Union[AssertionError, UsageError] - - class _TestingErrorMeta(NamedTuple): - type: Type[_TestingError] + type: Type[Exception] msg: str def amend_msg(self, prefix: str = "", postfix: str = "") -> "_TestingErrorMeta": return self._replace(msg=f"{prefix}{self.msg}{postfix}") - def to_error(self) -> _TestingError: + def to_error(self) -> Exception: return self.type(self.msg) @@ -203,9 +193,8 @@ def _check_supported_tensor(input: Tensor) -> Optional[_TestingErrorMeta]: Returns: (Optional[_TestingErrorMeta]): If check did not pass. """ - if input.layout not in {torch.strided, torch.sparse_coo, torch.sparse_csr}: # type: ignore[attr-defined] - return _TestingErrorMeta(UsageError, f"Unsupported tensor layout {input.layout}") + return _TestingErrorMeta(ValueError, f"Unsupported tensor layout {input.layout}") return None @@ -566,7 +555,7 @@ def _to_tensor(tensor_or_scalar_like: Any) -> Tuple[Optional[_TestingErrorMeta], tensor = torch.as_tensor(tensor_or_scalar_like) except Exception: error_meta = _TestingErrorMeta( - UsageError, f"No tensor can be constructed from type {type(tensor_or_scalar_like)}." + ValueError, f"No tensor can be constructed from type {type(tensor_or_scalar_like)}." 
) return error_meta, None @@ -787,8 +776,8 @@ def assert_close( the mismatching tensors and a namespace of diagnostics about the mismatches. See below for details. Raises: - UsageError: If no :class:`torch.Tensor` can be constructed from an input. - UsageError: If only :attr:`rtol` or :attr:`atol` is specified. + ValueError: If no :class:`torch.Tensor` can be constructed from an input. + ValueError: If only :attr:`rtol` or :attr:`atol` is specified. AssertionError: If corresponding inputs are not Python scalars and are not directly related. AssertionError: If :attr:`allow_subclasses` is ``False``, but corresponding inputs are not Python scalars and have different types. @@ -970,7 +959,7 @@ def assert_close( if (rtol is None) ^ (atol is None): # We require both tolerance to be omitted or specified, because specifying only one might lead to surprising # results. Imagine setting atol=0.0 and the tensors still match because rtol>0.0. - raise UsageError( + raise ValueError( f"Both 'rtol' and 'atol' must be either specified or omitted, " f"but got no {'rtol' if rtol is None else 'atol'}.", ) From 962c9fbf856bfc0b564b7fc6fbb83b0cdad8e469 Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Fri, 9 Jul 2021 10:46:18 -0700 Subject: [PATCH 027/122] [pruner] add handles for hooks (#61425) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61425 Adding handle for activation reconstruction and bias forward hooks so they can be removed later ghstack-source-id: 133244536 Test Plan: This change should not affect behavior yet, but to double check: `buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1LpM9 Reviewed By: z-a-f Differential Revision: D29619720 fbshipit-source-id: c7428d2d0325cd11ce7919e0b67321e8cc196041 --- torch/ao/sparsity/experimental/pruner/base_pruner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index 88680ea73d891..cd5b55aedc75a 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -59,6 +59,8 @@ def __init__(self, model, config, defaults): self.module_groups = [] self.enable_mask_update = False + self.activation_handle = None + self.bias_handle = None self.model = model # If no config -- try getting all the supported layers @@ -130,14 +132,14 @@ def prepare(self, use_path=False, *args, **kwargs): param(module.mask), unsafe=True) - module.register_forward_hook( + self.activation_handle = module.register_forward_hook( ActivationReconstruction(module.parametrizations.weight[0]) ) if module.bias is not None: module.register_parameter('_bias', nn.Parameter(module.bias.detach())) module.bias = None - module.register_forward_hook(self.bias_hook) + self.bias_handle = module.register_forward_hook(self.bias_hook) def convert(self, use_path=False, *args, **kwargs): From b5c464d5ef3d0783196d1d94bbf2f9c231906d0e Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 9 Jul 2021 11:07:47 -0700 Subject: [PATCH 028/122] Make Future store weak pointers to storages (#60943) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60943 In https://github.com/pytorch/pytorch/pull/60470 we made Future store Storages rather than store references to their DataPtrs (because these references could go stale...). However this meant that the Future could keep the Storage alive, and thus keep its memory allocated, even after the user was done with it. 
We fix it here by instead storing a weak ptr to that Storage (well, in fact to the StorageImpl, but it's the same). ghstack-source-id: 133295799 Test Plan: CI Reviewed By: mrshenli Differential Revision: D29454104 fbshipit-source-id: d36dee00a4841c087bb7b3f5bc39e0459f209cdb --- aten/src/ATen/core/ivalue.cpp | 16 ++++----- aten/src/ATen/core/ivalue_inl.h | 36 +++++++++++-------- c10/core/Storage.h | 4 +++ torch/csrc/distributed/rpc/message.cpp | 7 ++-- torch/csrc/distributed/rpc/message.h | 6 ++-- .../rpc/request_callback_no_python.cpp | 3 +- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 3 +- 7 files changed, 46 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index e6e8d809a631d..6fab54ff9dd82 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -945,9 +945,9 @@ getClassConverter() { } // Needs to be in this .cpp file to access the full definition of PyObjectHolder -std::vector ivalue::Future::extractStorages( +std::vector> ivalue::Future::extractStorages( const at::IValue& value) { - std::vector storages; + std::vector> weakStorageImpls; // getSubValues works poorly on Python objects: it only works if they can be // converted to a "regular" IValue type hence, for example, it doesn't support // custom subclasses. Thus, instead, we extract the tensors through pickling. @@ -966,16 +966,16 @@ std::vector ivalue::Future::extractStorages( num_storages += 1; } } - storages.reserve(num_storages); + weakStorageImpls.reserve(num_storages); for (const at::Tensor& tensor : tensors) { if (tensor.is_sparse()) { // Sparse tensor is indices and values. Both are tensors // and contain storage. - storages.push_back(tensor.indices().storage()); - storages.push_back(tensor.values().storage()); + weakStorageImpls.push_back(tensor.indices().storage().getWeakStorageImpl()); + weakStorageImpls.push_back(tensor.values().storage().getWeakStorageImpl()); } else { // A dense/strided tensor contains 1 storage - storages.push_back(tensor.storage()); + weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl()); } } } else { @@ -985,11 +985,11 @@ std::vector ivalue::Future::extractStorages( value.getSubValues(sub_values); for (const at::IValue& sub_value : sub_values) { if (sub_value.isTensor()) { - storages.push_back(sub_value.toTensor().storage()); + weakStorageImpls.push_back(sub_value.toTensor().storage().getWeakStorageImpl()); } } } - return storages; + return weakStorageImpls; } TORCH_API intrusive_ptr collectAll( diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index d9ff52842947a..d22dfecd0a43f 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -374,14 +374,15 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { * IValue::getSubValues() or through pickling in case of Python object; or * when 2) customized storage extraction is more efficient. */ + using WeakStorage = c10::weak_intrusive_ptr; void markCompleted( IValue value, - c10::optional> storages = c10::nullopt) { + c10::optional> storages = c10::nullopt) { // Start by performing all steps that can throw, before setting any field. // Do this before even acquiring the mutex, because extractStorages might // acquire the GIL, which could lead to a lock inversion with our mutex. // See https://github.com/pytorch/pytorch/issues/58239. 
- std::vector actualStorages; + std::vector actualStorages; std::vector usedDevices; try { // FIXME We should always extract DataPtrs, in order to catch the case of @@ -481,7 +482,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // This accessor should only be used if we know that the future is // completed() with no error. - const std::vector& storages() const { + const std::vector& storages() const { std::unique_lock lock(mutex_); AT_ASSERT(completed()); AT_ASSERT(!eptr_); @@ -517,7 +518,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { */ template c10::intrusive_ptr then(T callback, TypePtr type) { - using IValueWithStorages = std::tuple>; + using IValueWithStorages = std::tuple>; #if __cpp_lib_is_invocable >= 201703 static_assert( guts::disjunction< @@ -535,7 +536,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { IValueWithStorages>::value>( [&](auto identity) { IValue value; - std::vector storages; + std::vector storages; std::tie(value, storages) = identity(cb)(parentFut); childFut->markCompleted(std::move(value), std::move(storages)); }, @@ -658,11 +659,14 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { event.block(impl_.getStream(event.device())); } - for (const c10::Storage& storage : storages_) { - const at::DataPtr& data_ptr = storage.data_ptr(); - if (!data_ptr.device().is_cpu()) { + for (const WeakStorage& weak_storage : storages_) { + c10::intrusive_ptr storage = weak_storage.lock(); + if (!storage) { + continue; + } + if (!storage->device().is_cpu()) { impl_.recordDataPtrOnStream( - data_ptr, impl_.getStream(data_ptr.device())); + storage->data_ptr(), impl_.getStream(storage->device())); } } } @@ -702,16 +706,20 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { } // Defined in ivalue.cpp. - static std::vector extractStorages( + static std::vector extractStorages( const at::IValue& value); static std::vector getDevicesOfStorages( const c10::impl::VirtualGuardImpl& impl, - const std::vector& storages) { + const std::vector& storages) { c10::DeviceIndex deviceCount = impl.deviceCount(); std::vector isDeviceUsed(deviceCount, false); - for (const c10::Storage& storage : storages) { - c10::Device device = storage.device(); + for (const WeakStorage& weak_storage : storages) { + c10::intrusive_ptr storage = weak_storage.lock(); + if (!storage) { + continue; + } + c10::Device device = storage->device(); if (!device.is_cpu()) { TORCH_CHECK_VALUE( device.type() == impl.type(), @@ -843,7 +851,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // A cached version of the storages extracted from the value when the future // is first marked completed. - std::vector storages_; + std::vector storages_; // The bounding set of devices that this future, and any of its children, is // allowed to use. 
This is a superset of the set of devices used by the events diff --git a/c10/core/Storage.h b/c10/core/Storage.h index f8df22b55e695..11c7d396fa221 100644 --- a/c10/core/Storage.h +++ b/c10/core/Storage.h @@ -118,6 +118,10 @@ struct C10_API Storage { return storage_impl_.get(); } + c10::weak_intrusive_ptr getWeakStorageImpl() const { + return c10::weak_intrusive_ptr(storage_impl_); + } + operator bool() const { return storage_impl_; } diff --git a/torch/csrc/distributed/rpc/message.cpp b/torch/csrc/distributed/rpc/message.cpp index ef34ca66d2d18..02771140f69bb 100644 --- a/torch/csrc/distributed/rpc/message.cpp +++ b/torch/csrc/distributed/rpc/message.cpp @@ -66,11 +66,12 @@ void Message::setId(int64_t id) { id_ = id; } -std::vector Message::getStorages() const { - std::vector storages; +std::vector> Message::getStorages() + const { + std::vector> storages; storages.reserve(tensors_.size()); for (const auto& tensor : tensors_) { - storages.emplace_back(tensor.storage()); + storages.emplace_back(tensor.storage().getWeakStorageImpl()); } return storages; } diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h index 733c757b7ad03..2ad3baacc07ed 100644 --- a/torch/csrc/distributed/rpc/message.h +++ b/torch/csrc/distributed/rpc/message.h @@ -153,7 +153,7 @@ class TORCH_API Message final : public torch::CustomClassHolder { int64_t id() const; void setId(int64_t id); - std::vector getStorages() const; + std::vector> getStorages() const; private: std::vector payload_; @@ -178,7 +178,9 @@ TORCH_API c10::intrusive_ptr createExceptionResponse( const std::string& exceptionStr, int64_t id); -inline std::tuple, std::vector> +inline std::tuple< + c10::intrusive_ptr, + std::vector>> withStorages(c10::intrusive_ptr message) { auto storages = message->getStorages(); return std::make_tuple(std::move(message), std::move(storages)); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index e9a3b5e34e52b..cd9b65d394039 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -605,7 +605,8 @@ c10::intrusive_ptr RequestCallbackNoPython::asFuture( auto future = c10::make_intrusive( at::getCustomClassType>(), RpcAgent::getCurrentRpcAgent()->getDevices()); - std::vector storages = message->getStorages(); + std::vector> storages = + message->getStorages(); future->markCompleted(std::move(message), std::move(storages)); return future; } diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 25f946aa4b885..0dbf5589e4880 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -1249,7 +1249,8 @@ void TensorPipeAgent::markFutureAsComplete( message{std::move(message)}, streams{std::move(streams)}]() mutable { c10::MultiStreamGuard guard(streams); - std::vector storages = message->getStorages(); + std::vector> storages = + message->getStorages(); atomicFuture->jitFuture->markCompleted( std::move(message), std::move(storages)); // The future's callbacks may schedule further RPCs, increasing the count. 
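To make the ownership change easier to picture, here is a rough Python analogy of the lock-and-skip pattern used above (purely illustrative: the real change uses c10::weak_intrusive_ptr<StorageImpl> and StorageImpl lifetimes, not Python objects):

```
import weakref

class LargeBuffer:
    """Stand-in for a StorageImpl owning a big allocation."""
    def __init__(self, nbytes):
        self.data = bytearray(nbytes)

class CompletedFuture:
    """Caches only weak references, so it never extends the buffers' lifetime."""
    def __init__(self, buffers):
        self._weak_buffers = [weakref.ref(b) for b in buffers]

    def record_buffers(self):
        for ref in self._weak_buffers:
            buf = ref()           # analogous to weak_storage.lock()
            if buf is None:       # owner already freed the storage; just skip it
                continue
            # ... record buf on the relevant stream ...

buf = LargeBuffer(1 << 20)
fut = CompletedFuture([buf])
del buf                  # the future does not keep the allocation alive
fut.record_buffers()     # dead references are simply skipped
```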
From c830db02653a5ab555d966a3252426cfe8b6737a Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 9 Jul 2021 11:25:42 -0700 Subject: [PATCH 029/122] Raise error in CMake for CUDA <9.2 (#61462) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61462 Anything before CUDA 9.2 is not supported (see https://github.com/pytorch/pytorch/pull/36848), and perhaps not even that. ghstack-source-id: 133312018 Test Plan: CI Reviewed By: samestep Differential Revision: D29637251 fbshipit-source-id: 4300169b7298274b2074649342902a34bd2220b5 --- cmake/public/cuda.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index b1d8306796473..bb7c6a5226f6c 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -38,8 +38,8 @@ endif() message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) -if(CUDA_VERSION VERSION_LESS 9.0) - message(FATAL_ERROR "PyTorch requires CUDA 9.0 and above.") +if(CUDA_VERSION VERSION_LESS 9.2) + message(FATAL_ERROR "PyTorch requires CUDA 9.2 or above.") endif() if(CUDA_FOUND) From 86463a8d0276c4a7d7a0d499e46f7a21e34fde78 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Fri, 9 Jul 2021 11:30:11 -0700 Subject: [PATCH 030/122] Save some little memory in `default_collate` (#61424) Summary: It can be a non-little save if there are many workers and a large batch size. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61424 Reviewed By: soulitzer Differential Revision: D29635477 Pulled By: ejguan fbshipit-source-id: 1fc48b5964e873bd8833ad81bed9d51b0b6d137e --- torch/utils/data/_utils/collate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/utils/data/_utils/collate.py b/torch/utils/data/_utils/collate.py index e520de6ebee91..19160c75cb485 100644 --- a/torch/utils/data/_utils/collate.py +++ b/torch/utils/data/_utils/collate.py @@ -50,7 +50,7 @@ def default_collate(batch): if torch.utils.data.get_worker_info() is not None: # If we're in a background process, concatenate directly into a # shared memory tensor to avoid an extra copy - numel = sum([x.numel() for x in batch]) + numel = sum(x.numel() for x in batch) storage = elem.storage()._new_shared(numel) out = elem.new(storage) return torch.stack(batch, 0, out=out) From b01329b1645396a85e2f97e17e6082bd3e9b45fe Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 9 Jul 2021 11:52:03 -0700 Subject: [PATCH 031/122] [xplat] Update XNNPACK to github revision 79cd5f9 (#61400) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61400 allow-large-files Update XNNPACK to github version 79cd5f9. Test Plan: Spark apps build works. 
Hand tracking works: https://pxl.cl/1L76g Reviewed By: dreiss Differential Revision: D29385882 fbshipit-source-id: 6be920a68b876faedf7e86e33df43f8b1db14a4d --- third_party/XNNPACK | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/XNNPACK b/third_party/XNNPACK index 55d53a4e7079d..79cd5f9e18ad0 160000 --- a/third_party/XNNPACK +++ b/third_party/XNNPACK @@ -1 +1 @@ -Subproject commit 55d53a4e7079d38e90acd75dd9e4f9e781d2da35 +Subproject commit 79cd5f9e18ad0925ac9a050b00ea5a36230072db From 6bb33d93ab94bb268d7cfb600c700585720bcdde Mon Sep 17 00:00:00 2001 From: Dimitrije Jankov Date: Fri, 9 Jul 2021 12:18:24 -0700 Subject: [PATCH 032/122] disable the format library in C10 (#60052) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60052 Introduction: We would like to use the minimal implementation of C10 for our our SGX port of pytorch. This would include disabling signal handlers and the fmt library. Problem : When C10_SUPPORTS_SIGNAL_HANDLER is disabled there is no reason to have fmt enabled as it is used only in stacktraceSignalHandler. The problem is that fmt/format.h is included regardless whether C10_SUPPORTS_SIGNAL_HANDLER is disabled or not. Solution : Move the #include inside the #ifdef section of code where C10_SUPPORTS_SIGNAL_HANDLER is checked. Test Plan: Run the pytorch unit tests. Reviewed By: h397wang, LiJihang Differential Revision: D29022628 fbshipit-source-id: 638cf98381585cd6059129d9c5a65d9e6a841575 --- c10/util/signal_handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp index fda1c6410dd44..7eda700979fa8 100644 --- a/c10/util/signal_handler.cpp +++ b/c10/util/signal_handler.cpp @@ -1,6 +1,5 @@ #include #include -#include #if defined(C10_SUPPORTS_SIGNAL_HANDLER) @@ -8,6 +7,7 @@ #include #include #include +#include #include #include #include From 8296cb37c726bfa02e0a1708ebf87925cc37dbe6 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Fri, 9 Jul 2021 12:25:51 -0700 Subject: [PATCH 033/122] [torchelastic] Set the correct maximum border width Summary: The diff sets the correct max border delimiters between error sections Test Plan: Example of the uncontrolled border: https://www.internalfb.com/intern/testinfra/diagnostics/7599824415964133.844424970500348.1625590344/ Reviewed By: kiukchung Differential Revision: D29636814 fbshipit-source-id: 95465d3150066bff82dc7499bb1c63ea4f5ebc2d --- torch/distributed/elastic/multiprocessing/errors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index 892a4c301bf1d..7746dbace9af5 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -245,7 +245,7 @@ def format_msg(self, boarder_delim="*", section_delim="="): other_failures_fmt.append(fmt) # upper boundary on width - width = max(width, 250) + width = min(width, 250) return Template(_MSG_FORMAT_TEMPLATE).substitute( boarder=boarder_delim * width, From 3b004aed3afd25ca48b5e6fb810aebadc7ac9d6d Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 9 Jul 2021 12:43:42 -0700 Subject: [PATCH 034/122] Enable local clang-tidy lint (#61121) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61121 This change enables the make target to run clang-tidy locally Test Plan: Run this command ``` make 
clang-tidy ``` This should run `clang-tidy` on the paths and filters specified in `tools/linter/clang_tidy/__main__.py` Quicklint ``` make quicklint ``` This should report "No files detected" if no c/cpp files are altered. Reviewed By: soulitzer Differential Revision: D29598927 Pulled By: 1ntEgr8 fbshipit-source-id: aa443030494fed92c313da4b203a5450be09fa38 --- Makefile | 10 +-- tools/actions_local_runner.py | 28 +++++++- tools/linter/clang_tidy/__main__.py | 75 ++++++++++++++++++-- tools/linter/clang_tidy/run.py | 105 +++++++++++++++++++--------- 4 files changed, 175 insertions(+), 43 deletions(-) diff --git a/Makefile b/Makefile index 94688032c4249..63cbef8732a91 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,7 @@ setup_lint: fi pip install jinja2 pip install -r tools/linter/clang_tidy/requirements.txt + $(PYTHON) -m tools.linter.install.clang_tidy quick_checks: # TODO: This is broken when 'git config submodule.recurse' is 'true' since the @@ -104,9 +105,10 @@ cmakelint: --job 'cmakelint' \ --step 'Run cmakelint' -clang_tidy: - echo "clang-tidy local lint is not yet implemented" - exit 1 +clang-tidy: + @$(PYTHON) tools/actions_local_runner.py \ + $(CHANGED_ONLY) \ + --job 'clang-tidy' toc: @$(PYTHON) tools/actions_local_runner.py \ @@ -117,4 +119,4 @@ toc: lint: flake8 mypy quick_checks cmakelint shellcheck quicklint: CHANGED_ONLY=--changed-only -quicklint: mypy flake8 quick_checks cmakelint shellcheck +quicklint: mypy flake8 quick_checks cmakelint shellcheck clang-tidy diff --git a/tools/actions_local_runner.py b/tools/actions_local_runner.py index 3968057107b30..3a0c745da3e9d 100755 --- a/tools/actions_local_runner.py +++ b/tools/actions_local_runner.py @@ -270,7 +270,8 @@ def filter_files(self, files: List[str]) -> List[str]: async def quick(self, files: List[str]) -> CommandResult: return await shell_cmd( - ["tools/linter/run_shellcheck.sh"] + [os.path.join(REPO_ROOT, f) for f in files], + ["tools/linter/run_shellcheck.sh"] + + [os.path.join(REPO_ROOT, f) for f in files], ) async def full(self) -> None: @@ -289,6 +290,30 @@ async def full(self) -> None: ) +class ClangTidy(Check): + name = "clang-tidy: Run clang-tidy" + common_options = [ + "--clang-tidy-exe", + ".clang-tidy-bin/clang-tidy", + "--parallel", + ] + + def filter_files(self, files: List[str]) -> List[str]: + return self.filter_ext(files, {".c", ".cc", ".cpp"}) + + async def quick(self, files: List[str]) -> CommandResult: + return await shell_cmd( + [sys.executable, "tools/linter/clang_tidy", "--paths"] + + [os.path.join(REPO_ROOT, f) for f in files] + + self.common_options, + ) + + async def full(self) -> None: + await shell_cmd( + [sys.executable, "tools/linter/clang_tidy"] + self.common_options + ) + + class YamlStep(Check): def __init__(self, step: Dict[str, Any], job_name: str, quiet: bool): super().__init__(files=None, quiet=quiet) @@ -405,6 +430,7 @@ def main() -> None: "mypy": Mypy, "flake8-py3": Flake8, "shellcheck": ShellCheck, + "clang-tidy": ClangTidy, } if __name__ == "__main__": diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py index 0254ddc43b37e..3c36ffe7ac652 100644 --- a/tools/linter/clang_tidy/__main__.py +++ b/tools/linter/clang_tidy/__main__.py @@ -1,10 +1,52 @@ import argparse import pathlib +import os +import shutil +import subprocess +import re +from typing import List + from run import run from generate_build_files import generate_build_files +def clang_search_dirs() -> List[str]: + # Compilers are ordered based on fallback preference + # We pick the first one 
that is available on the system + compilers = ["clang", "gcc", "cpp", "cc"] + compilers = [c for c in compilers if shutil.which(c) is not None] + if len(compilers) == 0: + raise RuntimeError(f"None of {compilers} were found") + compiler = compilers[0] + + result = subprocess.run( + [compiler, "-E", "-x", "c++", "-", "-v"], + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) + stderr = result.stderr.decode().strip().split("\n") + search_start = r"#include.*search starts here:" + search_end = r"End of search list." + + append_path = False + search_paths = [] + for line in stderr: + if re.match(search_start, line): + if append_path: + continue + else: + append_path = True + elif re.match(search_end, line): + break + elif append_path: + search_paths.append(line.strip()) + + return search_paths + + DEFAULTS = { "glob": [ # The negative filters below are to exclude files that include onnx_pb.h or @@ -32,7 +74,7 @@ "-torch/csrc/deploy/interpreter/test_main.cpp", ], "paths": ["torch/csrc/"], - "include-dir": ["/usr/lib/llvm-11/include/openmp"], + "include-dir": ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs(), } @@ -68,7 +110,8 @@ def parse_args() -> argparse.Namespace: help="Path to the folder containing compile_commands.json", ) parser.add_argument( - "--diff-file", help="File containing diff to use for determining files to lint and line filters" + "--diff-file", + help="File containing diff to use for determining files to lint and line filters", ) parser.add_argument( "-p", @@ -103,7 +146,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--print-include-paths", action="store_true", - help="Print the search paths used for include directives" + help="Print the search paths used for include directives", ) parser.add_argument( "-I", @@ -112,8 +155,12 @@ def parse_args() -> argparse.Namespace: default=DEFAULTS["include-dir"], help="Add the specified directory to the search path for include files", ) - parser.add_argument("-s", "--suppress-diagnostics", action="store_true", - help="Add NOLINT to suppress clang-tidy violations") + parser.add_argument( + "-s", + "--suppress-diagnostics", + action="store_true", + help="Add NOLINT to suppress clang-tidy violations", + ) parser.add_argument( "extra_args", nargs="*", help="Extra arguments to forward to clang-tidy" ) @@ -124,7 +171,23 @@ def main() -> None: if not pathlib.Path("build").exists(): generate_build_files() options = parse_args() - run(options) + + # Check if clang-tidy executable exists + exists = os.access(options.clang_tidy_exe, os.X_OK) or shutil.which( + options.clang_tidy_exe + ) + if not exists: + msg = ( + "Could not find 'clang-tidy' binary\n" + "You can install it by running:\n" + " python3 tools/linter/install/clang_tidy.py" + ) + print(msg) + exit(1) + + return_code = run(options) + if return_code != 0: + raise RuntimeError("Warnings found in clang-tidy output!") main() diff --git a/tools/linter/clang_tidy/run.py b/tools/linter/clang_tidy/run.py index ea0babbae29d4..a0328a907d175 100644 --- a/tools/linter/clang_tidy/run.py +++ b/tools/linter/clang_tidy/run.py @@ -13,7 +13,6 @@ """ - import collections import fnmatch import json @@ -52,7 +51,9 @@ def map_filename(build_folder: str, fname: str) -> str: build_cpu_prefix = os.path.join(build_folder, native_cpu_prefix, "") default_arch_suffix = ".DEFAULT.cpp" if fname.startswith(native_cpu_prefix) and fname.endswith(".cpp"): - return f"{build_cpu_prefix}{fname[len(native_cpu_prefix):]}{default_arch_suffix}" + return ( 
+ f"{build_cpu_prefix}{fname[len(native_cpu_prefix):]}{default_arch_suffix}" + ) if fname.startswith(build_cpu_prefix) and fname.endswith(default_arch_suffix): return f"{native_cpu_prefix}{fname[len(build_cpu_prefix):-len(default_arch_suffix)]}" return fname @@ -165,11 +166,12 @@ def find_changed_lines(diff: str) -> Dict[str, List[Tuple[int, int]]]: def run_shell_commands_in_parallel(commands: Iterable[List[str]]) -> str: """runs all the commands in parallel with ninja, commands is a List[List[str]]""" + async def run_command(cmd: List[str]) -> str: proc = await asyncio.create_subprocess_shell( - ' '.join(shlex.quote(x) for x in cmd), # type: ignore[attr-defined] + " ".join(shlex.quote(x) for x in cmd), # type: ignore[attr-defined] stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE + stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() return f">>>\nstdout:\n{stdout.decode()}\nstderr:\n{stderr.decode()}\n<<<" @@ -181,7 +183,9 @@ async def sem_task(task: Any) -> Any: async with semaphore: return await task - return await asyncio.gather(*(sem_task(task) for task in tasks), return_exceptions=True) + return await asyncio.gather( + *(sem_task(task) for task in tasks), return_exceptions=True + ) async def helper() -> Any: coros = [run_command(cmd) for cmd in commands] @@ -192,7 +196,9 @@ async def helper() -> Any: return "\n".join(results) -def run_clang_tidy(options: Any, line_filters: List[Dict[str, Any]], files: Iterable[str]) -> str: +def run_clang_tidy( + options: Any, line_filters: List[Dict[str, Any]], files: Iterable[str] +) -> str: """Executes the actual clang-tidy command in the shell.""" command = [options.clang_tidy_exe, "-p", options.compile_commands_dir] if not options.config_file and os.path.exists(".clang-tidy"): @@ -202,7 +208,10 @@ def run_clang_tidy(options: Any, line_filters: List[Dict[str, Any]], files: Iter with open(options.config_file) as config: # Here we convert the YAML config file to a JSON blob. - command += ["-config", json.dumps(yaml.load(config, Loader=yaml.SafeLoader))] + command += [ + "-config", + json.dumps(yaml.load(config, Loader=yaml.SafeLoader)), + ] if options.print_include_paths: command += ["--extra-arg", "-v"] if options.include_dir: @@ -215,7 +224,10 @@ def run_clang_tidy(options: Any, line_filters: List[Dict[str, Any]], files: Iter command += ["-line-filter", json.dumps(line_filters)] if options.parallel: - commands = [list(command) + [map_filename(options.compile_commands_dir, f)] for f in files] + commands = [ + list(command) + [map_filename(options.compile_commands_dir, f)] + for f in files + ] output = run_shell_commands_in_parallel(commands) else: command += map_filenames(options.compile_commands_dir, files) @@ -232,7 +244,9 @@ def run_clang_tidy(options: Any, line_filters: List[Dict[str, Any]], files: Iter return output -def extract_warnings(output: str, base_dir: str = ".") -> Dict[str, Dict[int, Set[str]]]: +def extract_warnings( + output: str, base_dir: str = "." 
+) -> Dict[str, Dict[int, Set[str]]]: rc: Dict[str, Dict[int, Set[str]]] = {} for line in output.split("\n"): p = CLANG_WARNING_PATTERN.match(line) @@ -258,17 +272,50 @@ def apply_nolint(fname: str, warnings: Dict[int, Set[str]]) -> None: line_offset = -1 # As in .cpp files lines are numbered starting from 1 for line_no in sorted(warnings.keys()): - nolint_diagnostics = ','.join(warnings[line_no]) + nolint_diagnostics = ",".join(warnings[line_no]) line_no += line_offset - indent = ' ' * (len(lines[line_no]) - len(lines[line_no].lstrip(' '))) - lines.insert(line_no, f'{indent}// NOLINTNEXTLINE({nolint_diagnostics})\n') + indent = " " * (len(lines[line_no]) - len(lines[line_no].lstrip(" "))) + lines.insert(line_no, f"{indent}// NOLINTNEXTLINE({nolint_diagnostics})\n") line_offset += 1 with open(fname, mode="w") as f: f.write("".join(lines)) -def run(options: Any) -> None: +def filter_from_diff( + paths: List[str], diffs: List[str] +) -> Tuple[List[str], List[Dict[Any, Any]]]: + files = [] + line_filters = [] + + for diff in diffs: + changed_files = find_changed_lines(diff) + changed_files = { + filename: v + for filename, v in changed_files.items() + if any(filename.startswith(path) for path in paths) + } + line_filters += [ + {"name": name, "lines": lines} for name, lines, in changed_files.items() + ] + files += list(changed_files.keys()) + + return files, line_filters + + +def filter_from_diff_file( + paths: List[str], filename: str +) -> Tuple[List[str], List[Dict[Any, Any]]]: + with open(filename, "r") as f: + diff = f.read() + return filter_from_diff(paths, [diff]) + + +def filter_default(paths: List[str]) -> Tuple[List[str], List[Dict[Any, Any]]]: + return get_all_files(paths), [] + + +def run(options: Any) -> int: # This flag is pervasive enough to set it globally. It makes the code # cleaner compared to threading it through every single function. global VERBOSE @@ -276,25 +323,12 @@ def run(options: Any) -> None: # Normalize the paths first. 
paths = [path.rstrip("/") for path in options.paths] + if options.diff_file: - with open(options.diff_file, "r") as f: - changed_files = find_changed_lines(f.read()) - changed_files = { - filename: v - for filename, v in changed_files.items() - if any(filename.startswith(path) for path in options.paths) - } - line_filters = [ - {"name": name, "lines": lines} for name, lines, in changed_files.items() - ] - files = list(changed_files.keys()) - # Since header files are excluded, add .cpp file if it exists in the same folder - cpp_files = [f[:-1] + "cpp" for f in files if f.endswith(".h")] - cpp_files = [f for f in cpp_files if os.path.exists(f)] - files = list(set(files + cpp_files)) + files, line_filters = filter_from_diff_file(options.paths, options.diff_file) else: - line_filters = [] - files = get_all_files(paths) + files, line_filters = filter_default(options.paths) + file_patterns = get_file_patterns(options.glob, options.regex) files = list(filter_files(files, file_patterns)) @@ -304,8 +338,13 @@ def run(options: Any) -> None: sys.exit() clang_tidy_output = run_clang_tidy(options, line_filters, files) + warnings = extract_warnings( + clang_tidy_output, base_dir=options.compile_commands_dir + ) if options.suppress_diagnostics: - warnings = extract_warnings(clang_tidy_output, base_dir=options.compile_commands_dir) + warnings = extract_warnings( + clang_tidy_output, base_dir=options.compile_commands_dir + ) for fname in warnings.keys(): mapped_fname = map_filename(options.compile_commands_dir, fname) print(f"Applying fixes to {mapped_fname}") @@ -318,4 +357,6 @@ def run(options: Any) -> None: print(clang_tidy_output) for line in clang_tidy_output.splitlines(): if line.startswith(pwd): - print(line[len(pwd):]) + print(line[len(pwd) :]) + + return len(warnings.keys()) From 711ded688de0e351eddfbe5751260fa23bc697c7 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 9 Jul 2021 12:50:57 -0700 Subject: [PATCH 035/122] Add a script to codemod max_tokens_total pragmas to C/C++ files (#61369) Summary: This PR adds a new script: `max_tokens_pragmas.py` This is a utility script that can add/remove `max_tokens_total` pragmas from the codebase. 
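To make the intended behavior concrete before the checklist and CLI walkthrough below, here is a small Python-level sketch of the two helpers the script exposes (`add_max_tokens_pragma` and `strip_max_tokens_pragmas`, defined in the new file further down in this patch). The snippet is illustrative only and is not part of the patch; the heredoc examples below exercise the same behavior through the command line.

```python
from tools.linter.clang_tidy.max_tokens_pragma import (
    add_max_tokens_pragma,
    strip_max_tokens_pragmas,
)

src = "\n".join([
    "// File with a prior pragma",
    "#pragma clang max_tokens_total 1",
    "int main() { return 0; }",
])

# An existing pragma is rewritten to the requested token budget; a file
# without one gets the pragma prepended instead.
updated = add_max_tokens_pragma(src, 42)
assert "#pragma clang max_tokens_total 42" in updated

# --strip mode removes every max_tokens_total pragma again.
assert "max_tokens_total" not in strip_max_tokens_pragmas(updated)
```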
- [x] Implement script and test manually - [x] Write test script Examples: First, change directories ```bash cd tools/linter/clang_tidy ``` Then run the following: ```bash cat << EOF > test/test1.cpp // File without any prior pragmas int main() { for (int i = 0; i < 10; i++); return 0; } EOF cat << EOF > test/test2.cpp // File with prior pragmas #pragma clang max_tokens_total 1 int main() { for (int i = 0; i < 10; i++); return 0; } EOF cat << EOF > test/test3.cpp // File with multiple prior pragmas #pragma clang max_tokens_total 1 // Different pragma; script should ignore this #pragma clang max_tokens_here 20 int main() { for (int i = 0; i < 10; i++); return 0; } #pragma clang max_tokens_total 1 EOF # Add pragmas to some files python3 max_tokens_pragma.py --num-max-tokens 42 test/*.cpp grep "#pragma clang max_tokens_total 42" test/*.cpp # Remove pragmas from files python3 max_tokens_pragma.py --strip test/*.cpp grep "#pragma clang max_tokens_total 42" test/*.cpp # should fail # Ignore files python3 max_tokens_pragma.py --num-max-tokens 42 test/*.cpp --ignores test/test2.cpp grep "#pragma clang max_tokens_total 42" test/*.cpp # should not list `test/test2.cpp` ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/61369 Test Plan: `tools/linter/clang_tidy/test/test_max_tokens_pragma.py` Reviewed By: malfet Differential Revision: D29604291 Pulled By: 1ntEgr8 fbshipit-source-id: 3efe52573583769041a07e6776161d4d5bbf16a7 --- .github/workflows/lint.yml | 2 +- tools/linter/clang_tidy/__init__.py | 0 tools/linter/clang_tidy/__main__.py | 4 +- tools/linter/clang_tidy/max_tokens_pragma.py | 111 ++++++++++++++++ tools/test/test_max_tokens_pragma.py | 132 +++++++++++++++++++ 5 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 tools/linter/clang_tidy/__init__.py create mode 100644 tools/linter/clang_tidy/max_tokens_pragma.py create mode 100644 tools/test/test_max_tokens_pragma.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c16eed89297c0..4d12baa5610f7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -335,7 +335,7 @@ jobs: - name: Run clang-tidy run: | cd "${GITHUB_WORKSPACE}" - python3 tools/linter/clang_tidy \ + python3 -m tools.linter.clang_tidy \ --diff-file pr.diff \ --parallel \ --verbose \ diff --git a/tools/linter/clang_tidy/__init__.py b/tools/linter/clang_tidy/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py index 3c36ffe7ac652..6567f434b64f2 100644 --- a/tools/linter/clang_tidy/__main__.py +++ b/tools/linter/clang_tidy/__main__.py @@ -7,8 +7,8 @@ from typing import List -from run import run -from generate_build_files import generate_build_files +from tools.linter.clang_tidy.run import run +from tools.linter.clang_tidy.generate_build_files import generate_build_files def clang_search_dirs() -> List[str]: diff --git a/tools/linter/clang_tidy/max_tokens_pragma.py b/tools/linter/clang_tidy/max_tokens_pragma.py new file mode 100644 index 0000000000000..4f7b152659f77 --- /dev/null +++ b/tools/linter/clang_tidy/max_tokens_pragma.py @@ -0,0 +1,111 @@ +import argparse +import re +from typing import List + + +# > Why is DEFAULT_MAX_TOKEN_COUNT set to 1? +# +# clang-tidy doesn't have a direct way to query for token counts in the +# codebase. The workaround is to set the max token count to 1. This will cause +# clang-tidy to output a warning with the actual token count of the file. 
+# +# A non-destructive way to set the max token count to 1 would be to pass it +# through the -fmax-tokens option. However, this flag will be overridden if here +# exists a #pragma max_tokens_total statement in the file. This necessitates a +# destructive way to set the max token count to 1. +DEFAULT_MAX_TOKEN_COUNT = 1 +MAX_TOKENS_CHECK_DIAG_NAME = "misc-max-tokens" +MAX_TOKENS_PRAGMA_PATTERN = r"^#pragma\s+clang\s+max_tokens_total\s+(\d+)$" + + +def add_max_tokens_pragma(code: str, num_max_tokens: int) -> str: + lines = code.splitlines() + + found_pragma = False + pragma = f"#pragma clang max_tokens_total {num_max_tokens}" + + for idx, line in enumerate(lines): + match = re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) + if match: + found_pragma = True + token_count = match.group(1) + if int(token_count) != num_max_tokens: + lines[idx] = pragma + + if not found_pragma: + lines = [pragma] + lines + + return "\n".join(lines) + + +def strip_max_tokens_pragmas(code: str) -> str: + lines = code.splitlines() + lines = [ + line + for line in lines + if re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) is None + ] + return "\n".join(lines) + + +def add_max_tokens_pragma_to_files(files: List[str], num_max_tokens: int) -> None: + for filename in files: + with open(filename, "r+") as f: + data = f.read() + data = add_max_tokens_pragma(data, num_max_tokens) + + f.seek(0) + f.write(data) + f.truncate() + + +def strip_max_tokens_pragma_from_files(files: List[str]) -> None: + for filename in files: + with open(filename, "r+") as f: + data = f.read() + data = strip_max_tokens_pragmas(data) + + f.seek(0) + f.write(data) + f.truncate() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Add max_tokens_total pragmas to C/C++ source files" + ) + parser.add_argument( + "-n", + "--num-max-tokens", + default=DEFAULT_MAX_TOKEN_COUNT, + help="Set the token count to this value", + type=int, + ) + parser.add_argument( + "files", nargs="+", help="Add max_tokens_total pragmas to the specified files" + ) + parser.add_argument( + "-i", "--ignore", nargs="+", default=[], help="Ignore the specified files" + ) + parser.add_argument( + "-s", + "--strip", + action="store_true", + help="Remove max_tokens_total pragmas from the input files", + ) + return parser.parse_args() + + +def main() -> None: + options = parse_args() + + ignored = set(options.ignore) + files = [filename for filename in options.files if filename not in ignored] + if options.strip: + strip_max_tokens_pragma_from_files(files) + else: + add_max_tokens_pragma_to_files(files, options.num_max_tokens) + + +if __name__ == "__main__": + main() diff --git a/tools/test/test_max_tokens_pragma.py b/tools/test/test_max_tokens_pragma.py new file mode 100644 index 0000000000000..746b51e39d030 --- /dev/null +++ b/tools/test/test_max_tokens_pragma.py @@ -0,0 +1,132 @@ +import unittest +from tools.linter.clang_tidy.max_tokens_pragma import ( + add_max_tokens_pragma, + strip_max_tokens_pragmas, +) + + +def compare_code(a: str, b: str) -> bool: + a_lines = [line.strip() for line in a.splitlines()] + b_lines = [line.strip() for line in b.splitlines()] + return a_lines == b_lines + + +class TestMaxTokensPragma(unittest.TestCase): + def test_no_prior_pragmas(self) -> None: + input = """\ + // File without any prior pragmas + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + """ + + expected = """\ + #pragma clang max_tokens_total 42 + // File without any prior pragmas + + int main() { + for (int i = 0; i < 10; i++); 
+ return 0; + } + """ + output = add_max_tokens_pragma(input, 42) + self.assertTrue(compare_code(output, expected)) + + output = strip_max_tokens_pragmas(output) + self.assertTrue(compare_code(output, input)) + + def test_single_prior_pragma(self) -> None: + input = """\ + // File with prior pragmas + + #pragma clang max_tokens_total 1 + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + """ + + expected = """\ + // File with prior pragmas + + #pragma clang max_tokens_total 42 + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + """ + stripped = """\ + // File with prior pragmas + + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + """ + + output = add_max_tokens_pragma(input, 42) + self.assertTrue(compare_code(output, expected)) + + output = strip_max_tokens_pragmas(output) + self.assertTrue(compare_code(output, stripped)) + + def test_multiple_prior_pragmas(self) -> None: + input = """\ + // File with multiple prior pragmas + + #pragma clang max_tokens_total 1 + + // Different pragma; script should ignore this + #pragma clang max_tokens_here 20 + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + + #pragma clang max_tokens_total 1 + """ + + expected = """\ + // File with multiple prior pragmas + + #pragma clang max_tokens_total 42 + + // Different pragma; script should ignore this + #pragma clang max_tokens_here 20 + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + + #pragma clang max_tokens_total 42 + """ + stripped = """\ + // File with multiple prior pragmas + + + // Different pragma; script should ignore this + #pragma clang max_tokens_here 20 + + int main() { + for (int i = 0; i < 10; i++); + return 0; + } + + """ + + output = add_max_tokens_pragma(input, 42) + self.assertTrue(compare_code(output, expected)) + + output = strip_max_tokens_pragmas(output) + self.assertTrue(compare_code(output, stripped)) + + +if __name__ == "__main__": + unittest.main() From a5c5b56cf57869b4e545c9339e2c2fbc0f21986b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 9 Jul 2021 13:30:29 -0700 Subject: [PATCH 036/122] gen ExclusivelyOwned in structured kernels (#59827) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59827 ghstack-source-id: 133089541 Test Plan: existing CI Reviewed By: ezyang, janeyx99 Differential Revision: D28965922 fbshipit-source-id: ffbc1d43e5d3ab3abfad3b0830b4da1ce899f505 --- aten/src/ATen/templates/RegisterDispatchKey.cpp | 1 + tools/codegen/dest/register_dispatch_key.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index 5eac5c51965f9..c702a68063c31 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py index 775a2cf307583..fc2af8fc13931 100644 --- a/tools/codegen/dest/register_dispatch_key.py +++ b/tools/codegen/dest/register_dispatch_key.py @@ -314,12 +314,13 @@ def gen_class_set_output(self, k: SchemaKind, parent_class: str, generate_super: set_output_super = f"{parent_class}::set_output(output_idx, sizes, strides, options, names);" else: set_output_super = "" + maybe_star = "*" if k is SchemaKind.functional else "" return f""" void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, 
TensorOptions options, DimnameList names) override {{ {textwrap.indent(self.gen_class_set_output_body(k), " ")} if (!names.empty()) {{ - namedinference::propagate_names(outputs_[output_idx], names); + namedinference::propagate_names({maybe_star}outputs_[output_idx], names); }} // super must happen after, so that downstream can use maybe_get_output // to retrieve the output @@ -417,8 +418,10 @@ def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str: def gen_class( self, f: NativeFunction, k: SchemaKind, *, class_name: str, parent_class: str, generate_super: bool ) -> str: + maybe_star = '' if k is SchemaKind.functional: - output_type = "Tensor" + output_type = "c10::ExclusivelyOwned" + maybe_star = '*' elif k is SchemaKind.inplace: output_type = "std::reference_wrapper" elif k is SchemaKind.out: @@ -441,7 +444,7 @@ def gen_class( f"{textwrap.indent(class_ctor_str, indent)}", f"{textwrap.indent(self.gen_class_set_output(k, parent_class, generate_super), indent)}", " const Tensor& maybe_get_output(int64_t output_idx) override {", - " return outputs_[output_idx];", + f" return {maybe_star}outputs_[output_idx];", " }", f" std::array<{output_type}, {len(f.func.returns)}> outputs_;", f"{textwrap.indent(guard_field, indent)}", @@ -555,10 +558,11 @@ def generate_defn(cpp_sig: CppSignature) -> str: # After running meta, op.outputs_ is guaranteed to be valid; # add it to the context out_args = structured.out_arguments(self.g) + maybe_star = '*' if k is SchemaKind.functional else '' for i, out_arg in enumerate(out_args): assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type context.append(Expr( - expr=f"op.outputs_[{i}]", + expr=f"{maybe_star}op.outputs_[{i}]", # TODO: Stop hardcoding that the output type is a Tensor. Note # that for the codegen here this is fine because outputs_ is # hardcoded to be tensor already @@ -605,9 +609,9 @@ def generate_defn(cpp_sig: CppSignature) -> str: # TODO: Do this in translate instead if k is SchemaKind.functional: if len(f.func.returns) == 1: - ret_expr = "std::move(op.outputs_[0])" # small optimization + ret_expr = "std::move(op.outputs_[0]).take()" # small optimization else: - moved = ', '.join(f"std::move(op.outputs_[{i}])" for i in range(len(f.func.returns))) + moved = ', '.join(f"std::move(op.outputs_[{i}]).take()" for i in range(len(f.func.returns))) ret_expr = f"std::make_tuple({moved})" elif k is SchemaKind.inplace: ret_expr = "self" From fb7ed24f6e806924f92e587f0708a19fc22b9e67 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 9 Jul 2021 13:30:29 -0700 Subject: [PATCH 037/122] [PyTorch] Try using ExclusivelyOwned in LinearAlgebra (#59420) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59420 This is a sample of how we might use ExclusivelyOwned on an opt-in basis. ghstack-source-id: 133089540 Test Plan: 1) CI to run regression tests 2) Spot-checked assembly for linalg_det_out. Rather than calling the intrusive_ptr dtor, we get the ExclusivelyOwned dtor inline. In particular, we do not get any atomic refcount decrement instructions emitted. 
3) TODO: some kind of perf profiling; advice welcome Reviewed By: ezyang Differential Revision: D28885313 fbshipit-source-id: ae4b39ed738c41d0c4a4509a5199c040ba9aa63a --- aten/src/ATen/native/LinearAlgebra.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 0600f0aeb1175..4a67bf3240ea2 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -64,7 +64,7 @@ DEFINE_DISPATCH(linalg_vector_norm_stub); // det(P) = \pm 1, this method returns a 3-tuple: // (det(P), diag(U), info), // where info helps us identify singular matrices. -static inline std::tuple _lu_det_P_diag_U(const Tensor& self) { +static inline std::tuple, c10::ExclusivelyOwned> _lu_det_P_diag_U(const Tensor& self) { Tensor pivs, lu, infos; std::tie(lu, pivs, infos) = at::_lu_with_info(self, /*pivot=*/true, /*check_errors=*/false); TORCH_CHECK(infos.ge(0).all().item(), "Invalid argument passed to lu"); @@ -72,7 +72,8 @@ static inline std::tuple _lu_det_P_diag_U(const Tensor& self) { auto num_exchanges = (at::arange(1, n + 1, pivs.options()) != pivs) .sum(-1, /*keepdim=*/false, /*dtype=*/at::kLong).fmod_(2); auto u_diagonal = lu.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1); - return std::tuple(num_exchanges.mul_(-2).add_(1), u_diagonal); + num_exchanges.mul_(-2).add_(1); + return std::make_tuple(c10::ExclusivelyOwned(std::move(num_exchanges)), c10::ExclusivelyOwned(std::move(u_diagonal))); } // torch.det, alias for torch.linalg.det @@ -90,12 +91,12 @@ Tensor& linalg_det_out(const Tensor& self, Tensor& out) { IntArrayRef out_sizes(self.sizes().data(), self.dim() - 2); at::native::resize_output(out, out_sizes); - Tensor det_P, diag_U; + c10::ExclusivelyOwned det_P, diag_U; std::tie(det_P, diag_U) = _lu_det_P_diag_U(self); // complete_det is 0 when U is singular (U(i, i) = 0 for some i in [1, self.size(-1)]). // The product accumulation takes care of this case, and hence no special case handling is required. - at::prod_out(out, diag_U, -1); - out.mul_(det_P); + at::prod_out(out, *diag_U, -1); + out.mul_(*det_P); return out; } @@ -110,14 +111,14 @@ Tensor logdet(const Tensor& self) { TORCH_CHECK((at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type())), "Expected a floating point tensor as input"); - Tensor det_P, diag_U; + c10::ExclusivelyOwned det_P, diag_U; std::tie(det_P, diag_U) = _lu_det_P_diag_U(self); - Tensor det_sign = diag_U.sign().prod(-1).mul_(det_P); + Tensor det_sign = diag_U->sign().prod(-1).mul_(*det_P); // If det_sign > 0, diag_U.abs_().log_().sum(-1) gives logdet (this means U is not singular). // If det_sign <= 0, then we get proper nan (when det < 0, i.e., det_sign) or -inf (when det = 0, i.e., U is singular). // U is singular when U(i, i) = 0 for some i in [1, self.size(-1)]. 
- Tensor logdet_vals = diag_U.abs_().log_().sum(-1); + Tensor logdet_vals = diag_U->abs_().log_().sum(-1); if (self.dim() > 2) { auto indices = toListOfOptionalTensors((det_sign < 0).nonzero_numpy()); // NOLINTNEXTLINE(performance-move-const-arg) @@ -134,14 +135,14 @@ std::tuple linalg_slogdet(const Tensor& self) { TORCH_CHECK(t == ScalarType::Double || t == ScalarType::Float || t == ScalarType::ComplexFloat || t == ScalarType::ComplexDouble, "linalg_slogdet: expected a tensor of float, double, cfloat or cdouble types but got ", t); - Tensor det_P, diag_U; + c10::ExclusivelyOwned det_P, diag_U; std::tie(det_P, diag_U) = _lu_det_P_diag_U(self); - auto det_sign = diag_U.sgn().prod(-1).mul_(det_P); + auto det_sign = diag_U->sgn().prod(-1).mul_(*det_P); // abslogdet_val is -inf if U is singular, in which case diag_U.abs_().log_().sum(-1) will return -inf. // U is singular when U(i, i) = 0 for some i in [1, self.size(-1)]. // Since abslogdet_val cannot take nan, no special case handling is required. // in-place abs is not supported for complex tensors - auto abslogdet_val = isComplexType(t) ? diag_U.abs().log_().sum(-1) : diag_U.abs_().log_().sum(-1); + auto abslogdet_val = isComplexType(t) ? diag_U->abs().log_().sum(-1) : diag_U->abs_().log_().sum(-1); return std::make_tuple(det_sign, abslogdet_val); } From 4f4beb8286d625b6dca9948df17c2fab0922ca57 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 9 Jul 2021 13:32:13 -0700 Subject: [PATCH 038/122] Add Model Parallel Support to ZeRO (#61370) Summary: **Overview:** The existing `ZeroRedundancyOptimizer` implementation assumes that all model parameters are stored on the same device (due to the recent [refactor](https://github.com/pytorch/pytorch/pull/59834)). This change allows model parameters to be sharded across multiple devices, as in the DDP with Model Parallelism example [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). The only logic affected is the bucketing strategy used when `parameters_as_bucket_view=True`. Let `n` denote the world size and `k` denote the number of devices per process. - Previously, `k = 1`, and `self._buckets` was a `List[torch.Tensor]`, where `self._buckets[j]` is a tensor (i.e. bucket) containing the parameters assigned to rank `j` for `j = 0, ..., n - 1`. - Now, `self._buckets` is a `List[List[torch.Tensor]]`, where `self._buckets[i][j]` is a tensor containing the parameters stored on device `i` assigned to rank `j` for `i = 0, ..., k - 1` and `j = 0, ..., n - 1`. This bucket construction uses an auxiliary data structure `self._device_to_per_rank_params`, which is a `Dict[torch.device, List[List[torch.Tensor]]]`. It maps: - `dev_0` to `[rank 0's assigned parameters on dev_0, rank 1's assigned parameters on dev_1, ...]`, - `...` - `dev_{k-1}` to `[rank 0's assigned parameters on dev_{k-1}, rank 1's assigned parameters on dev_{k-1}, ...]` I removed the invariant checker `_verify_same_param_device()` and its corresponding test since it is no longer an invariant. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61370 Test Plan: I added a new test `test_zero_model_parallel()` that checks for parity between a DDP model with model parallelism using `ZeroRedundancyOptimizer` and a local model with the same architecture using a local optimizer. I also verified that the existing tests still pass. 
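As a concrete illustration of the bucketing layout described in the summary, the sketch below builds the nested `buckets[i][j]` structure from a device-to-per-rank parameter mapping. It is a simplified, standalone rendition for illustration only, and the helper name `build_buckets` is hypothetical; there is no process group, broadcasting, or trainability handling here, and the real logic lives in `_build_param_buckets` in this patch.

```python
from typing import Dict, List
import torch

def build_buckets(
    device_to_per_rank_params: Dict[torch.device, List[List[torch.Tensor]]],
) -> List[List[torch.Tensor]]:
    # buckets[i][j]: one flat tensor holding the parameters stored on the
    # i-th device that are assigned to rank j in the update partition.
    buckets: List[List[torch.Tensor]] = []
    for device, per_rank_params in device_to_per_rank_params.items():
        per_rank_buckets: List[torch.Tensor] = []
        for params in per_rank_params:
            if len(params) == 0:
                # Dummy bucket when this rank owns nothing on this device.
                per_rank_buckets.append(torch.zeros(1, device=device))
                continue
            size = sum(p.numel() for p in params)
            bucket = torch.empty(size, dtype=params[0].dtype, device=device)
            offset = 0
            for p in params:
                end = offset + p.numel()
                bucket[offset:end].copy_(p.data.flatten())
                # Make the parameter a view into the bucket so updating the
                # bucket in place also updates the parameter.
                p.data = bucket[offset:end].view_as(p.data)
                offset = end
            per_rank_buckets.append(bucket)
        buckets.append(per_rank_buckets)
    return buckets
```

With `k` devices per process and a world size of `n`, this yields `k * n` buckets, and parameter synchronization reduces to one broadcast per bucket instead of one per parameter.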
Reviewed By: soulitzer Differential Revision: D29637132 Pulled By: andwgu fbshipit-source-id: 07112959fa4e94a3f40e67e88cbb58ce3cd1e033 --- .../optim/test_zero_redundancy_optimizer.py | 120 ++++++++++++-- .../optim/zero_redundancy_optimizer.py | 155 +++++++++--------- 2 files changed, 184 insertions(+), 91 deletions(-) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index a18de8acace72..53793fa7ca979 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -248,22 +248,6 @@ def test_same_dense_param_type(self): with self.assertRaises(ValueError): ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) - def test_same_param_device(self): - """Check that ZeroRedundancyOptimizer raises an exception if the input - parameters are sharded on multiple devices. - - NOTE: This test should be removed once support for sharding a rank's - model parameters across multiple devices is added. - """ - if not torch.cuda.is_available() or torch.cuda.device_count() < 2: - return - self.dist_init(self.rank) - - # Move the parameters to cuda:0 and cuda:1 respectively - params = [torch.Tensor(1).to(0), torch.Tensor(1).to(1)] - with self.assertRaises(ValueError): - ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): @property @@ -791,6 +775,110 @@ def test_zero_join_cpu(self): """Check that the ZeRO join hook allows training with uneven inputs on CPU.""" self._test_zero_join(torch.device("cpu")) + def _test_zero_model_parallel(self, parameters_as_bucket_view: bool): + # Use two processes each with two GPUs + assert self.rank < 2 + NUM_EPOCHS = 3 + NUM_INPUTS = 5 + LR = 0.01 + torch.manual_seed(0) + torch.cuda.manual_seed(0) + + class ModelParallelModel(torch.nn.Module): + def __init__(self, dev0, dev1): + super().__init__() + self.dev0 = dev0 + self.dev1 = dev1 + self.net0 = torch.nn.Linear(10, 10).to(dev0) + self.relu = torch.nn.ReLU() + self.net1 = torch.nn.Linear(10, 5).to(dev1) + + def forward(self, x): + x = x.to(self.dev0) + x = self.relu(self.net0(x)) + x = x.to(self.dev1) + return self.net1(x) + + class LocalModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.net0 = torch.nn.Linear(10, 10) + self.relu = torch.nn.ReLU() + self.net1 = torch.nn.Linear(10, 5) + + def forward(self, x): + return self.net1(self.relu(self.net0(x))) + + dev0 = 2 * self.rank + dev1 = 2 * self.rank + 1 + mp_model = ModelParallelModel(dev0, dev1) + ddp_model = DDP(mp_model) + local_model = LocalModel() + cpu_device = torch.device("cpu") + # Ensure the parameters are the same across the two models + local_model.net0.weight = torch.nn.Parameter(mp_model.net0.weight.detach().clone().to(cpu_device)) + local_model.net0.bias = torch.nn.Parameter(mp_model.net0.bias.detach().clone().to(cpu_device)) + local_model.net1.weight = torch.nn.Parameter(mp_model.net1.weight.detach().clone().to(cpu_device)) + local_model.net1.bias = torch.nn.Parameter(mp_model.net1.bias.detach().clone().to(cpu_device)) + + # Compare parity between DDP with model parallelism using ZeRO and + # a local model using a local optimizer + zero_optim = ZeroRedundancyOptimizer( + ddp_model.parameters(), + optimizer_class=torch.optim.Adam, + parameters_as_bucket_view=parameters_as_bucket_view, + lr=LR + ) + local_optim = torch.optim.Adam(local_model.parameters(), lr=LR) + inputs = [torch.randn(20, 10) for _ in 
range(NUM_INPUTS)] + + for _ in range(NUM_EPOCHS): + for input in inputs: + def closure_local(): + local_optim.zero_grad() + local_loss = local_model(input).abs().sum() + local_loss.backward() + return local_loss + + def closure_ddp(): + zero_optim.zero_grad() + ddp_loss = ddp_model(input).abs().sum() + ddp_loss.backward() + return ddp_loss + + local_loss = cast(torch.Tensor, local_optim.step(closure=closure_local)) + ddp_loss = cast(torch.Tensor, zero_optim.step(closure=closure_ddp)).to(cpu_device) + + assert torch.allclose( + local_loss, ddp_loss + ), "Losses differ between local optim and ZeroRedundancyOptimizer" + + for local_p, ddp_p in zip(local_model.parameters(), ddp_model.parameters()): + ddp_p = ddp_p.to(cpu_device) + assert torch.allclose(local_p, ddp_p), "Models differ after a step" + + @common_distributed.skip_if_lt_x_gpu(4) + def test_zero_model_parallel_with_bucket_view(self): + """ + Check that ZeRO works with model parallelism where layers are sharded + across devices when ``parameters_as_bucket_view=True``. + """ + if self.rank >= 2: + return + self.dist_init(self.rank, world_size=2) + self._test_zero_model_parallel(parameters_as_bucket_view=True) + + @common_distributed.skip_if_lt_x_gpu(4) + def test_zero_model_parallel_without_bucket_view(self): + """ + Check that ZeRO works with model parallelism where layers are sharded + across devices when ``parameters_as_bucket_view=False``. + """ + if self.rank >= 2: + return + self.dist_init(self.rank, world_size=2) + self._test_zero_model_parallel(parameters_as_bucket_view=False) + if __name__ == "__main__": # ! unittest should not be used here, else the tests are not properly registered diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py index 24921e40942ad..7a4f7f2863664 100644 --- a/torch/distributed/optim/zero_redundancy_optimizer.py +++ b/torch/distributed/optim/zero_redundancy_optimizer.py @@ -179,9 +179,9 @@ class ZeroRedundancyOptimizer(Optimizer): >>> ddp(inputs).sum().backward() >>> opt.step() - .. note: Currently, ``ZeroRedundancyOptimizer`` requires that all of the - passed-in parameters are on the same device and that they are the same - dense type. + .. warning: + Currently, ``ZeroRedundancyOptimizer`` requires that all of the + passed-in parameters are the same dense type. .. warning: ZeroRedundancyOptimizer is experimental and subject to change. 
@@ -199,7 +199,6 @@ def __init__( ): # Perform type and assumption checks on the input parameters self._verify_and_init_params(params) - self._verify_same_param_device() self._verify_same_dense_param_type() # NOTE: The parent constructor uses `add_param_group()` which is @@ -217,6 +216,7 @@ def __init__( self._param_to_index_cache: Dict[torch.Tensor, int] = {} self._partition_parameters_cache: List[List[Dict]] = [] self._index_to_param_cache: List[torch.Tensor] = [] + self._device_to_per_rank_params_cache: Dict[torch.device, List[List[torch.Tensor]]] = {} # Default device for collective communication and buckets self._default_device = self._all_params[0].device @@ -232,7 +232,7 @@ def __init__( self.parameters_as_bucket_view = parameters_as_bucket_view self._is_trainable_mask = self._get_is_trainable_mask() - self._buckets: List[torch.Tensor] = [] + self._buckets: List[List[torch.Tensor]] = [] self._build_param_buckets() # Optional consolidated optimizer state, only populated if this rank @@ -249,6 +249,7 @@ def _clear_cache(self) -> None: self._param_to_rank_cache.clear() self._index_to_param_cache.clear() self._param_to_index_cache.clear() + self._device_to_per_rank_params_cache.clear() def add_param_group(self, param_group: dict) -> None: r""" @@ -385,7 +386,7 @@ def _partition_parameters(self) -> List[List[Dict]]: @property def _param_to_rank(self) -> Dict[torch.Tensor, int]: r""" - Hash table mapping parameters to their assigned data parallel rank in + Dict mapping parameters to their assigned data parallel rank in the partition. """ if len(self._param_to_rank_cache) == 0: @@ -398,7 +399,7 @@ def _param_to_rank(self) -> Dict[torch.Tensor, int]: @property def _param_to_index(self) -> Dict[torch.Tensor, int]: r""" - Hash table mapping parameters to their indices in the global optimizer + Dict mapping parameters to their indices in the global optimizer state. NOTE: This assumes that the global optimizer state's indexing (in @@ -411,7 +412,7 @@ def _param_to_index(self) -> Dict[torch.Tensor, int]: return self._param_to_index_cache @property - def _index_to_param(self) -> Dict[int, torch.Tensor]: + def _index_to_param(self) -> List[torch.Tensor]: r""" List mapping parameter indices in the global optimizer scheme to the actual params. @@ -431,12 +432,13 @@ def _sync_parameters(self): """ handles = [] if self.parameters_as_bucket_view: - for rank, bucket in enumerate(self._buckets): - global_rank = _get_global_rank(self.process_group, rank) - handles.append( - dist.broadcast(tensor=bucket, src=global_rank, - group=self.process_group, async_op=True) - ) + for dev_i_buckets in self._buckets: + for rank, bucket in enumerate(dev_i_buckets): + global_rank = _get_global_rank(self.process_group, rank) + handles.append( + dist.broadcast(tensor=bucket, src=global_rank, + group=self.process_group, async_op=True) + ) else: for rank, param_groups in enumerate(self._partition_parameters()): global_rank = _get_global_rank(self.process_group, rank) @@ -448,6 +450,33 @@ def _sync_parameters(self): ) _ = list(map(lambda x: x.wait(), handles)) + @property + def _device_to_per_rank_params(self) -> Dict[torch.device, List[List[torch.Tensor]]]: + r""" + Dict mapping device to a list of the per-rank parameter lists + containing the parameters stored on the device. + + Let ``dev_i`` denote the ``i``th device for this rank. Then: + ``dev_0`` maps to a list containing: + rank 0's assigned parameters stored on ``dev_0``, + rank 1's assigned parameters stored on ``dev_0``, + ... 
+ ``dev_1`` maps to a list containing: + rank 0's assigned parameters stored on ``dev_1``, + rank 1's assigned parameters stored on ``dev_1``, + ... + ... + """ + if len(self._device_to_per_rank_params_cache) == 0: + for rank, param_groups in enumerate(self._partition_parameters()): + for param_group in param_groups: + for param in param_group["params"]: + device = param.device + if device not in self._device_to_per_rank_params_cache: + self._device_to_per_rank_params_cache[device] = [[] for _ in range(self.world_size)] + self._device_to_per_rank_params_cache[device][rank].append(param) + return self._device_to_per_rank_params_cache + def step( self, closure: Optional[Callable[[], float]] = None, @@ -601,32 +630,38 @@ def _sync_param_groups( def _build_param_buckets(self) -> None: r""" - Builds parameter buckets so that for each device that stores this - rank's parameters, there is a bucket (represented as a tensor) - containing all of the parameters on that device that are assigned to a - given rank, if ``parameters_as_bucket_view`` is enabled. + Builds parameter buckets if ``parameters_as_bucket_view`` is enabled so + that for each device that stores this rank's parameters, there is a + bucket (represented as a tensor) containing all of the parameters on + that device that are assigned to a given rank in the parameter update + partition. This function is called in the constructor and any time parameter trainability is changed. - NOTE: The current implementation assumes that each rank stores all of - its parameters (i.e. ``self._all_params``) on a single device. This - means that there should be exactly ``world_size``-many buckets. + .. warning:: + The current implementation assumes that all of the parameters in a + bucket are of the same dense type when allocating the bucket's + tensor. - NOTE: The current implementation assumes that all of the parameters in - a bucket are of the same dense type when allocating the bucket's - tensor. + .. warning:: + If the model parameters are stored across more than one device, + then the storage partitioning must be the same across all + processes in order for parameter synchronization to work. 
""" if not self.parameters_as_bucket_view: return - for rank, param_groups in enumerate(self._partition_parameters()): - # Find the bucket size and dtype, compile the trainable - # parameters, and clone the non-trainable parameters - bucket_size = 0 - dtype = None - trainable_params = [] - for param_group in param_groups: - for param in param_group["params"]: + + # Bucket B_{i,j}: parameters stored on dev_i assigned to rank j + num_devices = len(self._device_to_per_rank_params) + self._buckets = [[] for _ in range(num_devices)] + + for dev_i, (device, param_lists) in enumerate(self._device_to_per_rank_params.items()): + for params in param_lists: + bucket_size = 0 + dtype = None + trainable_params = [] + for param in params: if not _is_trainable(param): # Clone in case the parameter was previously part of # a bucket to avoid the data from being destroyed @@ -635,26 +670,20 @@ def _build_param_buckets(self) -> None: bucket_size += param.numel() trainable_params.append(param) dtype = param.dtype # assumes all same dtype - device = self._default_device # assumes all on single device - if bucket_size == 0: - # Create a dummy bucket if there are no parameters - bucket = torch.zeros(1, device=device) - else: - # Construct the bucket (assuming all dense and same dtype) - bucket = torch.empty(bucket_size, dtype=dtype, device=device) - offset = 0 - for param in trainable_params: - offset_next = offset + param.numel() - bucket[offset:offset_next].copy_(param.data.flatten()) - param.data = bucket[offset:offset_next].view_as(param.data) - offset = offset_next - - # Either replace the existing bucket or create it - if len(self._buckets) != rank: - self._buckets[rank] = bucket - else: - self._buckets.append(bucket) + if bucket_size == 0: + # Create a dummy bucket if there are no parameters + bucket = torch.zeros(1, device=device) + else: + # Construct the bucket (assuming all dense and same dtype) + bucket = torch.empty(bucket_size, dtype=dtype, device=device) + offset = 0 + for param in trainable_params: + offset_next = offset + param.numel() + bucket[offset:offset_next].copy_(param.data.flatten()) + param.data = bucket[offset:offset_next].view_as(param.data) + offset = offset_next + self._buckets[dev_i].append(bucket) def _verify_and_init_params(self, params: Any) -> None: r""" @@ -687,30 +716,6 @@ def _verify_and_init_params(self, params: Any) -> None: "Tensors, but got an iterable containing " f"{torch.typename(param)}") - def _verify_same_param_device(self) -> None: - r""" - Verifies that ZeRO is being used under the single-process single- - device regime where a process operates exclusively on a full model - replica on a single device. - - The function assumes that ``self._all_params`` has been initialized - and is non-empty. - - Raises: - ValueError: ``params`` contains parameters across multiple - devices. - - NOTE: This function can be removed once support for sharding a rank's - model parameters across multiple devices is added. - """ - device = self._all_params[0].device - for param in self._all_params[1:]: - if param.device != device: - raise ValueError("ZeroRedundancyOptimizer assumes that each " - "rank's model parameters are on the same " - f"device but got both {device} and " - f"{param.device}") - def _verify_same_dense_param_type(self) -> None: r""" Verifies that all parameters are of the same dense type. 
From 35b950ea984a0a90ff822c8c2b417d07e4ab9d03 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Fri, 9 Jul 2021 13:50:42 -0700 Subject: [PATCH 039/122] [package] properly handle case where we are re-packaging mocked modules (#61434) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61434 Mocking is the only time we introduce a "special" module to a torch.package of our own creation. This interacts poorly with re-packaging, since if we treat `_mock` as a regular module and try to package it normally we will produce a broken package. This PR teaches PackageExporter to recognize `_mock` modules and treat them specially during the dependency walking process, thus avoiding the issue. Test Plan: Imported from OSS Reviewed By: jdonald, Lilyjjo Differential Revision: D29638283 Pulled By: suo fbshipit-source-id: 37a7ffa34da8bb665f679fbd72aa3d71154b2209 --- test/package/test_dependency_api.py | 31 +++++++++++++++++++++++++++++ torch/package/package_exporter.py | 28 ++++++++++++++++++++------ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index bbd9c4401cfff..d9ff435de57d5 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -305,6 +305,37 @@ def test_invalid_import(self): ), ) + @skipIf(version_info < (3, 7), "mock uses __getattr__ a 3.7 feature") + def test_repackage_mocked_module(self): + """Re-packaging a package that contains a mocked module should work correctly.""" + buffer = BytesIO() + with PackageExporter(buffer) as exporter: + exporter.mock("package_a") + exporter.save_source_string("foo", "import package_a") + + buffer.seek(0) + importer = PackageImporter(buffer) + foo = importer.import_module("foo") + + # "package_a" should be mocked out. + with self.assertRaises(NotImplementedError): + foo.package_a.get_something() + + # Re-package the model, but intern the previously-mocked module and mock + # everything else. + buffer2 = BytesIO() + with PackageExporter(buffer2, importer=importer) as exporter: + exporter.intern("package_a") + exporter.mock("**") + exporter.save_source_string("foo", "import package_a") + + buffer2.seek(0) + importer2 = PackageImporter(buffer2) + foo2 = importer2.import_module("foo") + + # "package_a" should still be mocked out. + with self.assertRaises(NotImplementedError): + foo2.package_a.get_something() if __name__ == "__main__": run_tests() diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 6c0ceaf924039..52d2985be09ad 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -48,6 +48,11 @@ class _ModuleProviderAction(Enum): EXTERN = 2 MOCK = 3 DENY = 4 + # Special case: when a module is mocked, PackageExporter writes out a + # `_mock` module that implements our mocking stubs. If we re-package code, + # we may encounter a `_mock` module from the original package. If we do, + # just ignore it and write a `_mock` module once. 
+ REPACKAGED_MOCK_MODULE = 5 class PackagingErrorReason(Enum): @@ -417,6 +422,14 @@ def add_dependency(self, module_name: str, dependencies=True): ): return + if module_name == "_mock": + self.dependency_graph.add_node( + module_name, + action=_ModuleProviderAction.REPACKAGED_MOCK_MODULE, + provided=True, + ) + return + if self._can_implicitly_extern(module_name): self.dependency_graph.add_node( module_name, action=_ModuleProviderAction.EXTERN, provided=True @@ -836,6 +849,11 @@ def _validate_dependency_graph(self): f"Exporter did not match any modules to {pattern}, which was marked as allow_empty=False" ) + def _write_mock_file(self): + if "_mock.py" not in self._written_files: + mock_file = str(Path(__file__).parent / "_mock.py") + self._write_source_string("_mock", _read_file(mock_file), is_package=False) + def _execute_dependency_graph(self): """Takes a finalized dependency graph describing how to package all modules and executes it, writing to the ZIP archive. @@ -857,12 +875,7 @@ def _execute_dependency_graph(self): for hook in self._mock_hooks.values(): hook(self, module_name) - if not _mock_written: - mock_file = str(Path(__file__).parent / "_mock.py") - self._write_source_string( - "_mock", _read_file(mock_file), is_package=False - ) - _mock_written = True + self._write_mock_file() is_package = hasattr(self._import_module(module_name), "__path__") self._write_source_string(module_name, _MOCK_IMPL, is_package) @@ -886,6 +899,9 @@ def _execute_dependency_graph(self): source = attrs["source"] self._write_source_string(module_name, source, is_package) + elif action == _ModuleProviderAction.REPACKAGED_MOCK_MODULE: + self._write_mock_file() + else: raise AssertionError( f"Invalid action: {module_name}, {action}. Please report a bug to PyTorch." From 9e81d3d869ce82a3025777e02d272122722eebb5 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 13:53:07 -0700 Subject: [PATCH 040/122] Make NNAPI linear converter accept flex inputs (#61022) Summary: As title Pull Request resolved: https://github.com/pytorch/pytorch/pull/61022 Test Plan: pytest test/test_nnapi.py::TestNNAPI::test_linear Reviewed By: anshuljain1 Differential Revision: D29480749 fbshipit-source-id: 35975861740298c9e16f866c939e7ee3c2151710 --- test/test_nnapi.py | 3 +++ torch/backends/_nnapi/serializer.py | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index e0d3ffbc4e09e..8b553fef578c1 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -422,6 +422,9 @@ def test_upsample_nearest2d(self): def test_linear(self): torch.manual_seed(29) self.check(torch.nn.Linear(16, 32), torch.randn(2, 16)) + self.check( + torch.nn.Linear(16, 32), torch.randn(2, 16), + convert_args=[torch.zeros(0, 16)]) def test_conv2d(self): cases = [ diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index 249aa65bb164c..7b50839b2ddd4 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1558,7 +1558,7 @@ def add_linear(self, node): self.add_addmm_or_linear(node, False, jit_input, jit_weight, jit_bias) def add_addmm_or_linear(self, node, transpose_weight, jit_input, jit_weight, jit_bias): - input_id, input_oper = self.get_tensor_operand_by_jitval_fixed_size(jit_input) + input_id, input_oper = self.get_tensor_operand_by_jitval(jit_input) bias_id, bias_oper = self.get_tensor_operand_for_weight(jit_bias) assert len(input_oper.shape) == 2 @@ -1575,6 +1575,10 @@ def add_addmm_or_linear(self, node, 
transpose_weight, jit_input, jit_weight, jit weight_oper = self.operands[weight_id] out_shape = (input_oper.shape[0], weight_oper.shape[0]) + out_id = self.add_tensor_operand(node.outputsAt(0), input_oper._replace(shape=out_shape)) + + if input_oper.shape[0] == 0: + self.forward_operand_shape(out_id, 0, input_id, 0) inputs = [None] * 4 inputs[0] = input_id @@ -1583,7 +1587,7 @@ def add_addmm_or_linear(self, node, transpose_weight, jit_input, jit_weight, jit inputs[3] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE) outputs = [None] * 1 - outputs[0] = self.add_tensor_operand(node.outputsAt(0), input_oper._replace(shape=out_shape)) + outputs[0] = out_id self.add_operation(NNAPI_OperationCode.FULLY_CONNECTED, inputs, outputs) From 76c0f223d3ff430369a2864cf4e05464b1bfb688 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 14:22:41 -0700 Subject: [PATCH 041/122] Make nnapi cat converter accept flex inputs Summary: As title Test Plan: pytest test/test_nnapi.py::TestNNAPI::test_cat Reviewed By: anshuljain1 Differential Revision: D29480747 fbshipit-source-id: 161803054ff1a4c2c750fc30a5f0fc6d8a24b2c9 --- test/test_nnapi.py | 11 +++++++++++ torch/backends/_nnapi/serializer.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index 8b553fef578c1..d6b94e7506e83 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -234,6 +234,17 @@ def forward(self, t1, t2): nhwc(torch.randn(1, 4, 3, 3)), ]) + self.check( + CatModule(1), + [ + torch.randn(1, 2, 3, 3), + torch.randn(1, 4, 3, 3), + ], + convert_args=[ + torch.zeros(0, 0, 0, 0), + torch.zeros(0, 0, 0, 0) + ]) + def test_pointwise_unary(self): for op in ["relu", "sigmoid"]: with self.subTest(op): diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index 7b50839b2ddd4..78499631be0b9 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1064,7 +1064,7 @@ def add_cat(self, node): out_oper = None out_dim_size = 0 for inp in tensors: - in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(inp) + in_id, in_oper = self.get_tensor_operand_by_jitval(inp) if out_oper is None: out_shape = change_element(in_oper.shape, dim, -1) out_oper = in_oper._replace(shape=out_shape) @@ -1085,10 +1085,19 @@ def add_cat(self, node): else: nnapi_dim = dim + out_id = self.add_tensor_operand(node.outputsAt(0), out_oper) + for idx, d in enumerate(out_oper.shape): + if d == 0: + if idx == dim: + shape = " + ".join(flex_name(ip_id, dim) for ip_id in in_ids) + self.compute_operand_shape(out_id, idx, shape) + else: + self.forward_operand_shape(out_id, idx, in_ids[0], idx) + inputs = in_ids + [self.add_immediate_int_scalar(nnapi_dim)] outputs = [None] * 1 - outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper) + outputs[0] = out_id self.add_operation(NNAPI_OperationCode.CONCATENATION, inputs, outputs) From 1f4bba77b6f59e8ddd7f929f2662f9a1ffce94d5 Mon Sep 17 00:00:00 2001 From: Bradley Davis Date: Fri, 9 Jul 2021 14:50:38 -0700 Subject: [PATCH 042/122] [fx] fix subgraph API call_module warning about no owning module (#61463) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61463 seems like a small oversight(?), current test fails when warnings are recorded. 
discovered this when calling `graph.call_module(existing_call_module_node.target)` and it raised a warning Test Plan: `buck test //caffe2/test:fx` Reviewed By: ansley Differential Revision: D29637799 fbshipit-source-id: 2305629863230235f76a926fe2e4de480cbf853c --- test/test_fx.py | 6 ++++-- torch/fx/graph.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 54bb843b9d87c..eaa3d867e8b41 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -2255,8 +2255,10 @@ def forward(self, x): conv = [n for n in a.graph.nodes if n.target == "net_b.net_c.conv"][-1] with a.graph.inserting_before(conv): - dropout = a.graph.call_module(module_name="net_b.net_c.dropout", - args=conv.args) + with warnings.catch_warnings(record=True) as w: + dropout = a.graph.call_module(module_name="net_b.net_c.dropout", + args=conv.args) + self.assertEqual(len(w), 0) conv.replace_all_uses_with(dropout) a.graph.erase_node(conv) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 5594b43d0bf7c..f89ac425dcd8f 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -603,7 +603,7 @@ def call_module(self, as :meth:`Graph.create_node`. """ if (self.owning_module and - self.owning_module.get_submodule(module_name) is not None): + self.owning_module.get_submodule(module_name) is None): warnings.warn("Attempted to insert a call_module Node with " "no underlying reference in the owning " "GraphModule! Call " From 028e438d6c2de839b269f5775d128897c05a419d Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Fri, 9 Jul 2021 14:51:40 -0700 Subject: [PATCH 043/122] [torchelastic] Make sure `rdzv_configs[timeout]` is not getting overwritten (#61471) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61471 Make sure `rdzv_configs[timeout]` is not getting overwritten Test Plan: sandcastle Differential Revision: D29638606 fbshipit-source-id: e164cdddaed77e7e35412ed58ac1ee312e9d489d --- torch/distributed/launcher/api.py | 73 +++++++++++++++++++------------ 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/torch/distributed/launcher/api.py b/torch/distributed/launcher/api.py index 1ee5abd649692..66d5b517ba3b1 100644 --- a/torch/distributed/launcher/api.py +++ b/torch/distributed/launcher/api.py @@ -27,33 +27,44 @@ @dataclass class LaunchConfig: """ - min_nodes: Minimum amount of nodes that the user function will - be launched on. Elastic agent ensures that the user - function start only when the min_nodes amount enters - the rendezvous. - max_nodes: Maximum amount of nodes that the user function - will be launched on. - nproc_per_node: On each node the elastic agent will launch - this amount of workers that will execute user - defined function. - rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd). - rdzv_endpoint: The endpoint of the rdzv sync. storage. - rdzv_id: The unique run id of the job (if not passed a unique one will be - deduced from run environment - flow workflow id in flow - or auto generated). - role: User defined role of the worker (defaults to "trainer"). - max_restarts: The maximum amount of restarts that elastic agent will conduct - on workers before failure. - monitor_interval: The interval in seconds that is used by the elastic_agent - as a period of monitoring workers. - start_method: The method is used by the elastic agent to start the - workers (spawn, fork, forkserver). - log_dir: base log directory where log files are written. If not set, - one is created in a tmp dir but NOT removed on exit. 
- redirects: configuration to redirect stdout/stderr to log files. - Pass a single ``Std`` enum to redirect all workers, - or a mapping keyed by local_rank to selectively redirect. - tee: configuration to "tee" stdout/stderr to console + log file. - metrics_cfg: configuration to initialize metrics. + Creates a rendezvous config. + + Args: + min_nodes: Minimum amount of nodes that the user function will + be launched on. Elastic agent ensures that the user + function start only when the min_nodes amount enters + the rendezvous. + max_nodes: Maximum amount of nodes that the user function + will be launched on. + nproc_per_node: On each node the elastic agent will launch + this amount of workers that will execute user + defined function. + rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd). + rdzv_endpoint: The endpoint of the rdzv sync. storage. + rdzv_configs: Key, value pair that specifies rendezvous specific configuration. + rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going + to be removed in future versions, see the note below. The default timeout is 900 seconds. + rdzv_id: The unique run id of the job (if not passed a unique one will be + deduced from run environment - flow workflow id in flow - or auto generated). + role: User defined role of the worker (defaults to "trainer"). + max_restarts: The maximum amount of restarts that elastic agent will conduct + on workers before failure. + monitor_interval: The interval in seconds that is used by the elastic_agent + as a period of monitoring workers. + start_method: The method is used by the elastic agent to start the + workers (spawn, fork, forkserver). + log_dir: base log directory where log files are written. If not set, + one is created in a tmp dir but NOT removed on exit. + redirects: configuration to redirect stdout/stderr to log files. + Pass a single ``Std`` enum to redirect all workers, + or a mapping keyed by local_rank to selectively redirect. + tee: configuration to "tee" stdout/stderr to console + log file. + metrics_cfg: configuration to initialize metrics. + + ..note: + `rdzv_timeout` is a legacy argument that will be removed in future. 
+ Set the timeout via `rdzv_configs['timeout']` + """ min_nodes: int @@ -64,7 +75,7 @@ class LaunchConfig: rdzv_endpoint: str = "" rdzv_backend: str = "etcd" rdzv_configs: Dict[str, Any] = field(default_factory=dict) - rdzv_timeout: int = 900 + rdzv_timeout: int = -1 max_restarts: int = 3 monitor_interval: float = 30 start_method: str = "spawn" @@ -74,7 +85,11 @@ class LaunchConfig: metrics_cfg: Dict[str, str] = field(default_factory=dict) def __post_init__(self): - self.rdzv_configs["timeout"] = self.rdzv_timeout + default_timeout = 900 + if self.rdzv_timeout != -1: + self.rdzv_configs["timeout"] = self.rdzv_timeout + elif "timeout" not in self.rdzv_configs: + self.rdzv_configs["timeout"] = default_timeout class elastic_launch: From ae65f63971c868e43d02e958523bd42121bfc73c Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 15:08:54 -0700 Subject: [PATCH 044/122] Make nnapi flatten converter accept flex inputs (#61024) Summary: As title Pull Request resolved: https://github.com/pytorch/pytorch/pull/61024 Test Plan: pytest test/test_nnapi.py::TestNNAPI::test_flatten Reviewed By: anshuljain1 Differential Revision: D29480748 fbshipit-source-id: c334b09600a64d3e552cec843d6da3de28e7d27c --- test/test_nnapi.py | 23 +++++++++++------------ torch/backends/_nnapi/serializer.py | 27 +++++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index d6b94e7506e83..bd990033594a7 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -154,18 +154,17 @@ def test_flatten(self): ]: self.check(mod, torch.randn(4, 2, 1, 3, 7)) - # TODO(axit): To add support for runtime - # self.check( - # torch.nn.Flatten(), - # torch.randn(4, 2, 1, 3, 7), - # convert_args=[torch.zeros(0, 2, 1, 3, 7)] - # ) - # with self.assertRaisesRegex(Exception, "dims can't be flexible"): - # self.check(torch.nn.Flatten(), torch.randn(4, 2, 0, 0, 7)) - # with self.assertRaisesRegex(Exception, "Only 1 dim"): - # self.check( - # torch.nn.Flatten(start_dim=1, end_dim=-2), - # torch.randn(0, 2, 1, 3, 0)) + self.check( + torch.nn.Flatten(), + torch.randn(4, 2, 1, 3, 7), + convert_args=[torch.zeros(0, 2, 1, 3, 7)] + ) + with self.assertRaisesRegex(Exception, "dims can't be flexible"): + self.check(torch.nn.Flatten(), torch.randn(4, 2, 0, 0, 7)) + with self.assertRaisesRegex(Exception, "Only 1 dim"): + self.check( + torch.nn.Flatten(start_dim=1, end_dim=-2), + torch.randn(0, 2, 1, 3, 0)) def test_slice(self): class SliceModule(torch.nn.Module): diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index 78499631be0b9..751d68ec89a5c 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -935,7 +935,7 @@ def add_flatten(self, node): assert node.inputsSize() == 3 assert node.outputsSize() == 1 - in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0)) + in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0)) start_ctype, start_dim = self.get_constant_value(node.inputsAt(1), "IntType") end_ctype, end_dim = self.get_constant_value(node.inputsAt(2), "IntType") @@ -956,23 +956,26 @@ def add_flatten(self, node): in_oper.shape[end_dim + 1:] ) - # TODO(axit): To add support for runtime - # if any(dim == 0 for dim in in_oper.shape[start_dim: end_dim + 1]): - # raise Exception("Flattened dims can't be flexible") - # non_flattened_dims = in_oper.shape[: start_dim] + in_oper.shape[end_dim + 1:] - # if non_flattened_dims.count(0) > 1: - # raise Exception("Only 
1 dim can be flexible") - # out_shape = tuple( - # dim if dim != 0 else -1 - # for dim in out_shape - # ) + if any(dim == 0 for dim in in_oper.shape[start_dim: end_dim + 1]): + raise Exception("Flattening flexible dims is not supported yet") + non_flattened_dims = in_oper.shape[: start_dim] + in_oper.shape[end_dim + 1:] + if non_flattened_dims.count(0) > 1: + raise Exception("Only 1 dim can be flexible") out_oper = in_oper._replace(shape=out_shape) out_id = self.add_tensor_operand(node.outputsAt(0), out_oper) + for idx, dim in enumerate(out_shape): + if dim == 0: + self.forward_operand_shape(out_id, idx, in_id, in_oper.shape.index(0)) + + inputs_1 = tuple( + dim if dim != 0 else -1 + for dim in out_shape + ) inputs = [None] * 2 inputs[0] = in_id - inputs[1] = self.add_immediate_int_vector(out_shape) + inputs[1] = self.add_immediate_int_vector(inputs_1) outputs = [None] * 1 outputs[0] = out_id From b5f0576278433adb8d1745b47d3b0f811a4c4242 Mon Sep 17 00:00:00 2001 From: Lily Johnson Date: Fri, 9 Jul 2021 15:09:45 -0700 Subject: [PATCH 045/122] [package] Modify Digraph to track predecessors (#61146) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61146 Track predecessors of nodes in DiGraph in order to enable cleaner dependency visualization code. Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D29559682 Pulled By: Lilyjjo fbshipit-source-id: 06f51b1108423aece5bdd72a7b82ab736e5e4f94 --- test/package/test_digraph.py | 15 +++++++++++++++ torch/package/_digraph.py | 13 +++++++++++++ 2 files changed, 28 insertions(+) diff --git a/test/package/test_digraph.py b/test/package/test_digraph.py index aa4e688c0a64e..6b3d8a590a151 100644 --- a/test/package/test_digraph.py +++ b/test/package/test_digraph.py @@ -21,11 +21,26 @@ def test_successors(self): self.assertIn("baz", list(g.successors("foo"))) self.assertEqual(len(list(g.successors("qux"))), 0) + def test_predecessors(self): + g = DiGraph() + g.add_edge("foo", "bar") + g.add_edge("foo", "baz") + g.add_node("qux") + + self.assertIn("foo", list(g.predecessors("bar"))) + self.assertIn("foo", list(g.predecessors("baz"))) + self.assertEqual(len(list(g.predecessors("qux"))), 0) + def test_successor_not_in_graph(self): g = DiGraph() with self.assertRaises(ValueError): g.successors("not in graph") + def test_predecessor_not_in_graph(self): + g = DiGraph() + with self.assertRaises(ValueError): + g.predecessors("not in graph") + def test_node_attrs(self): g = DiGraph() g.add_node("foo", my_attr=1, other_attr=2) diff --git a/torch/package/_digraph.py b/torch/package/_digraph.py index 19e3a0d0050ae..b63f84311cfef 100644 --- a/torch/package/_digraph.py +++ b/torch/package/_digraph.py @@ -11,6 +11,8 @@ def __init__(self): # Nested dict of node -> successor node -> nothing. # (didn't implement edge data) self._succ = {} + # Nested dict of node -> predecessor node -> nothing. + self._pred = {} def add_node(self, n, **kwargs): """Add a node to the graph. 
@@ -22,6 +24,7 @@ def add_node(self, n, **kwargs): if n not in self._node: self._node[n] = kwargs self._succ[n] = {} + self._pred[n] = {} else: self._node[n].update(kwargs) @@ -34,12 +37,15 @@ def add_edge(self, u, v): if u not in self._node: self._node[u] = {} self._succ[u] = {} + self._pred[u] = {} if v not in self._node: self._node[v] = {} self._succ[v] = {} + self._pred[v] = {} # add the edge self._succ[u][v] = True + self._pred[v][u] = True def successors(self, n): """Returns an iterator over successor nodes of n.""" @@ -48,6 +54,13 @@ def successors(self, n): except KeyError as e: raise ValueError(f"The node {n} is not in the digraph.") from e + def predecessors(self, n): + """Returns an iterator over predecessors nodes of n.""" + try: + return iter(self._pred[n]) + except KeyError as e: + raise ValueError(f"The node {n} is not in the digraph.") from e + @property def edges(self): """Returns an iterator over all edges (u, v) in the graph""" From 12772c8dd80f5f6d1ea3bffbafaaccdb21b8296d Mon Sep 17 00:00:00 2001 From: Lily Johnson Date: Fri, 9 Jul 2021 15:09:45 -0700 Subject: [PATCH 046/122] [package] PackageExporter visualization methods (#61147) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61147 Basic tooling to enable users to see what is inside of a PackageExporter. Added methods: - `externed/interned/mocked/denied_list()`: returns list of modules which are currently in the specified category - `relied_on_by(module_name)`: returns list of modules which rely on `module_name` - `dependency_graph_str()`: returns string format of graph for users. Example of output: ``` digraph G { rankdir = LR; node [shape=box]; "" -> "foo"; "foo" -> "torch.package"; "foo" -> "time"; "foo" -> "sentencepiece"; "foo" -> "package_top"; } ``` Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D29559683 Pulled By: Lilyjjo fbshipit-source-id: 5dff4d04af911a9c9fdd0d100420f1382eaef46e --- test/package/test_misc.py | 29 ++++++++++++ torch/package/package_exporter.py | 73 +++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/test/package/test_misc.py b/test/package/test_misc.py index ae8004f091d96..1eb2feff9b604 100644 --- a/test/package/test_misc.py +++ b/test/package/test_misc.py @@ -4,6 +4,7 @@ from textwrap import dedent from torch.package import PackageExporter, PackageImporter, is_from_package +from torch.package.package_exporter import PackagingError from torch.testing._internal.common_utils import run_tests try: @@ -115,6 +116,34 @@ def test_file_structure_has_file(self): self.assertTrue(file_structure.has_file("package_a/subpackage.py")) self.assertFalse(file_structure.has_file("package_a/subpackage")) + def test_exporter_content_lists(self): + """ + Test content list API for PackageExporter's contained modules. 
+ """ + + with PackageExporter(BytesIO()) as he: + import package_b + + he.extern("package_b.subpackage_1") + he.mock("package_b.subpackage_2") + he.intern("**") + he.save_pickle("obj", "obj.pkl", package_b.PackageBObject(["a"])) + self.assertEqual(he.externed_modules(), ["package_b.subpackage_1"]) + self.assertEqual(he.mocked_modules(), ["package_b.subpackage_2"]) + self.assertEqual( + he.interned_modules(), + ["package_b", "package_b.subpackage_0.subsubpackage_0"], + ) + self.assertEqual(he.get_rdeps("package_b.subpackage_2"), ["package_b"]) + + with self.assertRaises(PackagingError) as e: + with PackageExporter(BytesIO()) as he: + import package_b + + he.deny("package_b") + he.save_pickle("obj", "obj.pkl", package_b.PackageBObject(["a"])) + self.assertEqual(he.denied_modules(), ["package_b"]) + def test_is_from_package(self): """is_from_package should work for objects and modules""" import package_a.subpackage diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 52d2985be09ad..0a88b75245ef3 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -940,6 +940,79 @@ def _can_implicitly_extern(self, module_name: str): and is_stdlib_module(top_level_package_name) ) + def dependency_graph_string(self) -> str: + """Returns digraph string representation of dependencies in package. + + Returns: + A string representation of dependencies in package. + """ + edges = "\n".join(f'"{f}" -> "{t}";' for f, t in self.dependency_graph.edges) + return f"""\ +digraph G {{ +rankdir = LR; +node [shape=box]; +{edges} +}} +""" + + def _nodes_with_action_type( + self, action: Optional[_ModuleProviderAction] + ) -> List[str]: + result = [] + for name, node_dict in self.dependency_graph.nodes.items(): + node_action = node_dict.get("action", None) + if node_action == action and "is_pickle" not in node_dict: + result.append(name) + result.sort() + return result + + def externed_modules(self) -> List[str]: + """Return all modules that are currently externed. + + Returns: + A list containing the names of modules which will be + externed in this package. + """ + return self._nodes_with_action_type(_ModuleProviderAction.EXTERN) + + def interned_modules(self) -> List[str]: + """Return all modules that are currently interned. + + Returns: + A list containing the names of modules which will be + interned in this package. + """ + return self._nodes_with_action_type(_ModuleProviderAction.INTERN) + + def mocked_modules(self) -> List[str]: + """Return all modules that are currently mocked. + + Returns: + A list containing the names of modules which will be + mocked in this package. + """ + return self._nodes_with_action_type(_ModuleProviderAction.MOCK) + + def denied_modules(self) -> List[str]: + """Return all modules that are currently denied. + + Returns: + A list containing the names of modules which will be + denied in this package. + """ + return self._nodes_with_action_type(_ModuleProviderAction.DENY) + + def get_rdeps(self, module_name: str) -> List[str]: + """Return a list of all modules which depend on the module ``module_name``. + + Returns: + A list containing the names of modules which depend on ``module_name``. 
+ """ + if module_name in self.dependency_graph._pred.keys(): + return list(self.dependency_graph._pred[module_name].keys()) + else: + return [] + # even though these are in the standard library, we do not allow them to be # automatically externed since they offer a lot of system level access From cbb6ab6d88a2d39a0416f83f35a96dd07b4d651e Mon Sep 17 00:00:00 2001 From: Lily Johnson Date: Fri, 9 Jul 2021 15:09:45 -0700 Subject: [PATCH 047/122] [package] ignore dunder import errors (#61148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61148 Changes `__import__` processing to silently skip cases where the `__import__` statement cannot be parsed. Adds failed imports to a list retrievable by `PackageExporter.failed_dunder_import_list()`. Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D29559680 Pulled By: Lilyjjo fbshipit-source-id: 2513d0b9ef271c85cadc3f5a013fbd8c8de80b46 --- test/package/test_save_load.py | 8 +++ torch/package/find_file_dependencies.py | 79 ++++++++++++------------- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/test/package/test_save_load.py b/test/package/test_save_load.py index 1798c02270d4b..20601783ce40e 100644 --- a/test/package/test_save_load.py +++ b/test/package/test_save_load.py @@ -108,6 +108,14 @@ def test_dunder_imports(self): subsubpackage_0 = hi.import_module("package_b.subpackage_0.subsubpackage_0") self.assertEqual(subsubpackage_0.result, "subsubpackage_0") + def test_bad_dunder_imports(self): + """Test to ensure bad __imports__ don't cause PackageExporter to fail.""" + buffer = BytesIO() + with PackageExporter(buffer) as e: + e.save_source_string( + "m", '__import__(these, unresolvable, "things", wont, crash, me)' + ) + def test_save_module_binary(self): f = BytesIO() with PackageExporter(f) as he: diff --git a/torch/package/find_file_dependencies.py b/torch/package/find_file_dependencies.py index 7ade2d1f1a82c..cc16c339ea34e 100644 --- a/torch/package/find_file_dependencies.py +++ b/torch/package/find_file_dependencies.py @@ -57,47 +57,46 @@ def _grab_node_str(self, node): def visit_Call(self, node): # __import__ calls aren't routed to the visit_Import/From nodes if hasattr(node.func, "id") and node.func.id == "__import__": - if type(node.args[0]) not in [ast.Constant, ast.Str]: - # We don't want to parse dynamic uses of __import__ + try: + name = self._grab_node_str(node.args[0]) + fromlist = [] + level = 0 + if len(node.args) > 3: + for v in node.args[3].elts: + fromlist.append(self._grab_node_str(v)) + elif hasattr(node, "keywords"): + for keyword in node.keywords: + if keyword.arg == "fromlist": + for v in keyword.value.elts: + fromlist.append(self._grab_node_str(v)) + if len(node.args) > 4: + level = self._grab_node_int(node.args[4]) + elif hasattr(node, "keywords"): + for keyword in node.keywords: + if keyword.arg == "level": + level = self._grab_node_int(keyword.value) + if fromlist == []: + # the top-level package (the name up till the first dot) is returned + # when the fromlist argument is empty in normal import system, + # we need to include top level package to match this behavior and last + # level package to capture the intended dependency of user + self.references[(name, None)] = True + top_name = name.rsplit(".", maxsplit=1)[0] + if top_name != name: + top_name = self._absmodule(top_name, level) + self.references[(top_name, None)] = True + else: + name = self._absmodule(name, level) + for alias in fromlist: + # fromlist args may be submodules, so we have to add the fromlist args 
+ # to the list of potential references. If import of an arg fails we + # will ignore it, similar to visit_ImportFrom + if alias != "*": + self.references[(name, alias)] = True + else: + self.references[(name, None)] = True + except Exception as e: return - name = self._grab_node_str(node.args[0]) - fromlist = [] - level = 0 - if len(node.args) > 3: - for v in node.args[3].elts: - fromlist.append(self._grab_node_str(v)) - elif hasattr(node, "keywords"): - for keyword in node.keywords: - if keyword.arg == "fromlist": - for v in keyword.value.elts: - fromlist.append(self._grab_node_str(v)) - if len(node.args) > 4: - level = self._grab_node_int(node.args[4]) - elif hasattr(node, "keywords"): - for keyword in node.keywords: - if keyword.arg == "level": - level = self._grab_node_int(keyword.value) - if fromlist == []: - # the top-level package (the name up till the first dot) is returned - # when the fromlist argument is empty in normal import system, - # we need to include top level package to match this behavior and last - # level package to capture the intended dependency of user - self.references[(name, None)] = True - top_name = name.rsplit(".", maxsplit=1)[0] - if top_name != name: - top_name = self._absmodule(top_name, level) - self.references[(top_name, None)] = True - else: - name = self._absmodule(name, level) - for alias in fromlist: - # fromlist args may be submodules, so we have to add the fromlist args - # to the list of potential references. If import of an arg fails we - # will ignore it, similar to visit_ImportFrom - if alias != "*": - self.references[(name, alias)] = True - else: - self.references[(name, None)] = True - find_files_source_depends_on = _ExtractModuleReferences.run From a3670ba377c600b997674ca008019a70fa75321c Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 9 Jul 2021 15:21:35 -0700 Subject: [PATCH 048/122] Add option to specify custom NNAPI serializer (#61025) Summary: To add serializer for custom ops we can subclass default serializer and update ADDER_MAP Pull Request resolved: https://github.com/pytorch/pytorch/pull/61025 Test Plan: * pytest test/test_nnapi.py::TestNNAPI for current serializer * Custom serializers to be tested with custom ops Imported from OSS Reviewed By: anshuljain1 Differential Revision: D29480745 fbshipit-source-id: 37e3f8de3c97f6c8a486f9879ce11430ea89af34 --- torch/backends/_nnapi/prepare.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/torch/backends/_nnapi/prepare.py b/torch/backends/_nnapi/prepare.py index 83b5a3b347c00..724d2c20b2e9c 100644 --- a/torch/backends/_nnapi/prepare.py +++ b/torch/backends/_nnapi/prepare.py @@ -1,7 +1,8 @@ from typing import Optional, List import torch -from torch.backends._nnapi.serializer import serialize_model +from torch.backends._nnapi.serializer import _NnapiSerializer + class NnapiModule(torch.nn.Module): """Torch Module that wraps an NNAPI Compilation. 
@@ -75,14 +76,15 @@ def forward(self, args: List[torch.Tensor]) -> List[torch.Tensor]: raise Exception("Invalid mem_fmt") return outs - -def convert_model_to_nnapi(model, inputs): +def convert_model_to_nnapi(model, inputs, serializer=None): model = torch.jit.freeze(model) if isinstance(inputs, torch.Tensor): inputs = [inputs] - ser_model, used_weights, inp_mem_fmts, out_mem_fmts, shape_compute_lines, retval_count = serialize_model(model, inputs) + serializer = serializer or _NnapiSerializer(config=None) + (ser_model, used_weights, inp_mem_fmts, out_mem_fmts, shape_compute_lines, + retval_count) = serializer.serialize_model(model, inputs) ser_model_tensor = torch.tensor(ser_model, dtype=torch.int32) # We have to create a new class here every time this function is called From 54ea7d33ba39a4a9ecd86cfb4439a49e4143b783 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Fri, 9 Jul 2021 15:28:44 -0700 Subject: [PATCH 049/122] [package] error if we try to mock a module in 3.6 (#61469) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61469 This feature is not supported, error out early. Differential Revision: D29639797 D29639797 Test Plan: Imported from OSS Reviewed By: Lilyjjo Pulled By: suo fbshipit-source-id: 775ed78638fb6da8f830b632726b00c0533ed176 --- test/package/test_dependency_api.py | 9 +++++++++ torch/package/package_exporter.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index d9ff435de57d5..a42279c807df5 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -130,6 +130,15 @@ def test_mock(self): with self.assertRaisesRegex(NotImplementedError, "was mocked out"): r() + @skipIf(version_info > (3, 6), "tests specific 3.6 behavior") + def test_mock_36_error(self): + """Test that an error is properly thrown when we attempt to mock a + module in Python 3.6. + """ + with self.assertRaisesRegex(RuntimeError, "upgrade your Python"): + with PackageExporter(BytesIO()) as exporter: + exporter.mock(["package_a"]) + @skipIf(version_info < (3, 7), "mock uses __getattr__ a 3.7 feature") def test_mock_glob(self): buffer = BytesIO() diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 0a88b75245ef3..4f047fedce048 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -8,6 +8,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path +from sys import version_info from typing import ( Any, BinaryIO, @@ -719,6 +720,11 @@ def mock( If ``allow_empty=True``, no such exception is thrown. """ + if version_info < (3, 7): + raise RuntimeError( + "Python 3.7 or higher is required to mock out modules. " + "Please upgrade your Python version." + ) self.patterns[GlobGroup(include, exclude=exclude)] = _PatternInfo( _ModuleProviderAction.MOCK, allow_empty ) From dec5aa2260cef540b622bd9a9504b6f11cb1f607 Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Fri, 9 Jul 2021 16:13:27 -0700 Subject: [PATCH 050/122] [JIT] clean up (#60390) Summary: * Minor: spelling, grammar. * Add calls to `GRAPH_DUMP()` where they were missing. * Add or expand a few comments. * Move a few comments to seemingly more appropriate spots. * In canonicalize_graph_fuser_ops.cpp inline `runnableInputs()` since it was only called in one place and had a misleading comment and confusing name. * In `PeepholeOptimizeImpl::optimizeBlock()`, set `changed = true;` when removing `aten::is_complex`. 
Pretty sure its absence was a bug. * Delete unused `_jit_pass_remove_inplace_ops` and and its implementation `RemoveInplaceOps()`. * In `preprocessCaffe2Ops()`, remove redundant check for nested optional types. It was already checked in `checkONNXCompatibility()`. * In `EncoderBase::AddAttribute`, log the unexpected attribute kind. I don't remember the repro case now but I did hit this error at some point and this additional logging made it easier to understand. * In `fuseConvBatchNorm()` in eval_peephole.cpp, consistently use camelCase instead of snake_case for local variables. * Add curly braces around the bodies of if and loops. Pull Request resolved: https://github.com/pytorch/pytorch/pull/60390 Reviewed By: Krovatkin Differential Revision: D29523283 Pulled By: SplitInfinity fbshipit-source-id: 4e16c5648616f53da07d68dab7fdf252e06a0752 --- test/test_jit.py | 5 +- torch/_C/__init__.pyi.in | 1 - torch/csrc/jit/jit_log.cpp | 2 + torch/csrc/jit/jit_log.h | 1 + .../passes/canonicalize_graph_fuser_ops.cpp | 5 + .../csrc/jit/passes/constant_propagation.cpp | 26 ++-- torch/csrc/jit/passes/erase_number_types.cpp | 2 + torch/csrc/jit/passes/inline_fork_wait.cpp | 2 + torch/csrc/jit/passes/lower_tuples.cpp | 3 + torch/csrc/jit/passes/onnx.cpp | 17 ++- torch/csrc/jit/passes/onnx/constant_fold.cpp | 21 ++-- torch/csrc/jit/passes/onnx/constant_fold.h | 4 +- torch/csrc/jit/passes/onnx/eval_peephole.cpp | 112 ++++++++++-------- torch/csrc/jit/passes/onnx/eval_peephole.h | 6 +- torch/csrc/jit/passes/onnx/peephole.cpp | 81 ++++++++----- .../passes/onnx/prepare_division_for_onnx.cpp | 4 + .../jit/passes/onnx/preprocess_for_onnx.cpp | 18 +-- .../onnx/remove_inplace_ops_for_onnx.cpp | 5 +- .../jit/passes/onnx/scalar_type_analysis.cpp | 7 +- .../jit/passes/onnx/shape_type_inference.cpp | 13 +- .../passes/onnx/unpack_quantized_weights.cpp | 3 + torch/csrc/jit/passes/peephole.cpp | 71 ++++++----- .../jit/passes/peephole_alias_sensitive.cpp | 6 +- torch/csrc/jit/passes/remove_inplace_ops.cpp | 111 +++-------------- torch/csrc/jit/passes/remove_inplace_ops.h | 2 - torch/csrc/jit/passes/remove_mutation.cpp | 4 +- torch/csrc/jit/python/init.cpp | 10 +- torch/csrc/jit/python/python_arg_flatten.cpp | 6 +- torch/csrc/jit/serialization/export.cpp | 28 ++--- 29 files changed, 274 insertions(+), 302 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index eeeb391522778..3bfe6cf841957 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10900,9 +10900,8 @@ def func(a): graph = torch.jit.script(func).graph FileCheck().check("int = prim::Constant").check("aten::add_").run(str(graph)) - self.run_pass('remove_inplace_ops', graph) - self.run_pass('erase_number_types', graph) - FileCheck().check_not("int = prim::Constant").check_not("aten::add_").run(str(graph)) + self.run_pass("erase_number_types", graph) + FileCheck().check_not("int = prim::Constant").run(str(graph)) def test_remove_dropout(self): weight_0_shape = (20, 5) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 8cc5f8fd1d7e5..b2dc5b989f1bf 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -295,7 +295,6 @@ def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool = False) -> None: ... 
def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph, module: Module) -> None: ... -def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... def _jit_pass_fuse_addmm(graph: Graph) -> None: ... diff --git a/torch/csrc/jit/jit_log.cpp b/torch/csrc/jit/jit_log.cpp index 9165ceb24209c..9ef170f96c5c3 100644 --- a/torch/csrc/jit/jit_log.cpp +++ b/torch/csrc/jit/jit_log.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include diff --git a/torch/csrc/jit/jit_log.h b/torch/csrc/jit/jit_log.h index 3b3460b38471c..7e80ef6aa7750 100644 --- a/torch/csrc/jit/jit_log.h +++ b/torch/csrc/jit/jit_log.h @@ -2,6 +2,7 @@ #include #include #include +#include // `TorchScript` offers a simple logging facility that can enabled by setting an // environment variable `PYTORCH_JIT_LOG_LEVEL`. diff --git a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp index e6577ae97746b..a784680aeb1f7 100644 --- a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp +++ b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -48,6 +49,7 @@ static void CanonicalizeOps(Block* block) { "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || it->matches("aten::mul(Tensor self, Tensor other) -> Tensor") || it->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { + // Replace rank 0 Tensor constants with scalar constants. if (auto other = it->get(attr::other)) { if (other->dim() == 0) { WithInsertPoint insert_guard{*it}; @@ -64,6 +66,8 @@ static void CanonicalizeOps(Block* block) { } else if (it->matches( "aten::chunk(Tensor self, int chunks, int dim) -> Tensor[]", /*const_inputs=*/{attr::chunks, attr::dim})) { + // Replace aten::chunk (which returns a list) with ConstantChunk with the + // outputs unpacked. if (auto orig_outputs = getChunkOutputs(*it)) { WithInsertPoint guard(*it); auto* self = it->namedInput(attr::self); @@ -85,6 +89,7 @@ static void CanonicalizeOps(Block* block) { void CanonicalizeOps(const std::shared_ptr& graph) { CanonicalizeOps(graph->block()); + GRAPH_DUMP("After CanonicalizeOps: ", graph); EliminateDeadCode(graph); } diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index 323e747fbd87e..3a760f401d863 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -306,21 +306,6 @@ struct ConstantPropagator { made_change_ |= initial_outputs != node->outputs().size(); } - // An Op has runnable inputs if: - // - All inputs are constants. - // - It is an op that forwards tuples, and all inputs are constants - // or tuples that we know the ivalue for. We can't use known tuple ivalues - // for non-forwarding ops because that Tuple could contain an ivalue that is - // not allowed as a constant, for instance, a Tensor with a gradient. 
- bool runnableInputs(Node* n) { - if (std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - })) { - return true; - } - return false; - }; - bool noMutableValues(at::ArrayRef values) { return std::none_of(values.begin(), values.end(), [](Value* v) { return AliasDb::isMutableType(v); @@ -355,10 +340,13 @@ struct ConstantPropagator { } void ConstantPropagation(Node* n) { - bool runnable_inputs = runnableInputs(n); + bool constant_inputs = + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); if (n->kind() == prim::If) { // inline node if we can, otherwise check for simplified outputs - if (runnable_inputs) { + if (constant_inputs) { inlineIf(n); } else { ConstantPropagation(n->blocks()); @@ -371,7 +359,7 @@ struct ConstantPropagator { ConstantPropagation(n->blocks()); removeExtraLoopOutputs(n); } - } else if (runnable_inputs && supportedNode(n)) { + } else if (constant_inputs && supportedNode(n)) { propagateNode(n); } else { ConstantPropagation(n->blocks()); @@ -414,7 +402,7 @@ bool ConstantPropagationImmutableTypes(std::shared_ptr& graph) { if (made_change) { EliminateDeadCode(graph); } - GRAPH_DUMP("After ConstantPropagation: ", graph); + GRAPH_DUMP("After ConstantPropagationImmutableTypes: ", graph); return made_change; } diff --git a/torch/csrc/jit/passes/erase_number_types.cpp b/torch/csrc/jit/passes/erase_number_types.cpp index 6ac4236bbe9fa..2cd39aaf1a008 100644 --- a/torch/csrc/jit/passes/erase_number_types.cpp +++ b/torch/csrc/jit/passes/erase_number_types.cpp @@ -1,6 +1,7 @@ #include #include +#include #include namespace torch { @@ -67,6 +68,7 @@ void EraseNumberTypes(const std::shared_ptr& graph) { SetNumTypeToTensorType(inp); } EraseNumberTypesOnBlock(graph->block()); + GRAPH_DUMP("After EraseNumberTypes: ", graph); } } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/inline_fork_wait.cpp b/torch/csrc/jit/passes/inline_fork_wait.cpp index ef733e335e9fa..75e3c8d5a06a5 100644 --- a/torch/csrc/jit/passes/inline_fork_wait.cpp +++ b/torch/csrc/jit/passes/inline_fork_wait.cpp @@ -1,3 +1,4 @@ +#include #include namespace torch { @@ -57,6 +58,7 @@ void InlineForkWait( void InlineForkWait(const std::shared_ptr& graph) { std::unordered_map future_remap; InlineForkWait(graph->block(), future_remap); + GRAPH_DUMP("After InlineForkWait: ", graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/lower_tuples.cpp b/torch/csrc/jit/passes/lower_tuples.cpp index e53997216774d..9a9c5a9ec5f9b 100644 --- a/torch/csrc/jit/passes/lower_tuples.cpp +++ b/torch/csrc/jit/passes/lower_tuples.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace torch { @@ -310,6 +311,7 @@ static void EnsureNoTuples(Block* block) { void LowerAllTuples(const std::shared_ptr& graph) { LowerAllTuples(graph->block()); + GRAPH_DUMP("After LowerAllTuples: ", graph); EliminateDeadCode(graph->block()); EnsureNoTuples(graph->block()); } @@ -325,6 +327,7 @@ void LowerSimpleTuples(Block* block) { void LowerSimpleTuples(const std::shared_ptr& graph) { LowerSimpleTuples(graph->block()); + GRAPH_DUMP("After LowerSimpleTuples: ", graph); EliminateDeadCode(graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index 9fa68f51814f5..920fae7ec5936 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -42,6 +42,7 @@ void removePrintOps(Block* block) { void 
RemovePrintOps(std::shared_ptr& graph) { removePrintOps(graph->block()); + GRAPH_DUMP("After RemovePrintOps: ", graph); } void checkONNXCompatibility(const c10::FunctionSchema& schema) { @@ -56,6 +57,7 @@ void checkONNXCompatibility(const c10::FunctionSchema& schema) { auto type = arg.type(); if (type->kind() == TypeKind::OptionalType) { type = reinterpret_cast(type.get())->getElementType(); + // recursive optional type is not supported AT_ASSERT(type->kind() != TypeKind::OptionalType); } if (type->kind() == TypeKind::ListType) { @@ -91,14 +93,9 @@ void preprocessCaffe2Ops(Block* block) { auto type = arg.type(); AT_ASSERT(origin_inputs_index < origin_inputs.size()); const auto& origin_input = origin_inputs[origin_inputs_index++]; - if (type->kind() == TypeKind::OptionalType) { - type = reinterpret_cast(type.get())->getElementType(); - if (origin_input->mustBeNone()) { - continue; - } else { - // recursive optional type is not supported - AT_ASSERT(type->kind() != TypeKind::OptionalType); - } + if (type->kind() == TypeKind::OptionalType && + origin_input->mustBeNone()) { + continue; } if (type->isSubtypeOf(TensorType::get())) { it->addInput(origin_input); @@ -159,6 +156,7 @@ void preprocessCaffe2Ops(Block* block) { void PreprocessCaffe2Ops(std::shared_ptr& graph) { preprocessCaffe2Ops(graph->block()); + GRAPH_DUMP("After PreprocessCaffe2Ops: ", graph); } // Transform PythonOps into Nodes that match ONNX semantics. @@ -170,6 +168,7 @@ std::shared_ptr ToONNX( auto new_graph = std::make_shared(graph->current_scope()); std::unordered_map env; BlockToONNX(graph->block(), new_graph->block(), operator_export_type, env); + GRAPH_DUMP("after ToONNX: ", new_graph); return new_graph; } @@ -373,7 +372,7 @@ void NodeToONNX( // TODO: Assert it's an ATen identifier??? // (Sometimes it's not...) processSymbolicOutput(n->kind().toUnqualString(), n, raw_output); - GRAPH_DUMP("after process output:", new_block->owningGraph()); + GRAPH_DUMP("after processSymbolicOutput: ", new_block->owningGraph()); }; auto callPySymbolicMethod = [&](ConcretePythonOp* op) { diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp index f8fc63130231e..c2c0521033aab 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.cpp +++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -577,8 +578,6 @@ std::vector getOnnxConstParentsToRemove(Node* node) { // known. void ConstantFoldONNX(Block* b, ParamMap& paramsDict, int opset_version) { if (opset_version < ONNX_OPSET_9) { - // Number of elements of 'axes' and 'ends' 1-D input tensors should be the - // same std::cerr << "Warning: Constant folding supported for only opsets >= 9. " << "Constant folding not applied." << std::endl; return; @@ -613,7 +612,7 @@ void ConstantFoldONNX(Block* b, ParamMap& paramsDict, int opset_version) { continue; } // Create a new input to the block (prim::Param node output). Add a - // corresponding entryin valToParamMap. Replace the downstream inputs + // corresponding entry in valToParamMap. Replace the downstream inputs // with this value, and disconnect all the input values of the folded node. at::Tensor updatedVal = *updatedValWrapped; auto newSourceNodeOutput = b->addInput(); @@ -625,10 +624,10 @@ void ConstantFoldONNX(Block* b, ParamMap& paramsDict, int opset_version) { // Next we remove the current node that has been replaced by // an initializer. 
But before we start de-wiring this node, // we check if any parents of this nodes were onnx::Constant - // and remove them first (following proper sequence as shown - // below), and then remove the current node. If the parent was - // an initializer (not onnx::Constant) then they are all removed - // by eraseUnusedBlockInputs() call (below) outside the loop. + // and remove them first, and then remove the current node. + // If the parent was an initializer (not onnx::Constant) then + // they are all removed by the eraseUnusedBlockInputs() call + // (below) outside the loop. auto onnxConstParents = onnx_constant_fold::getOnnxConstParentsToRemove(node); node->removeAllInputs(); @@ -643,5 +642,13 @@ void ConstantFoldONNX(Block* b, ParamMap& paramsDict, int opset_version) { return; } +void ConstantFoldONNX( + std::shared_ptr& g, + ParamMap& paramsDict, + int opset_version) { + ConstantFoldONNX(g->block(), paramsDict, opset_version); + GRAPH_DUMP("After ConstantFoldONNX:", g); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h index 58d6eb3707e7a..1c54412ccd7a1 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.h +++ b/torch/csrc/jit/passes/onnx/constant_fold.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -23,7 +25,7 @@ c10::optional runTorchBackendForOnnx( } // namespace onnx_constant_fold void ConstantFoldONNX( - Block* b, + std::shared_ptr& g, std::map& paramDict, int opset_version); diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp index a1711b1cedf55..18dea16cb97ae 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp +++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -36,10 +37,10 @@ std::vector getValues( } // This pass fuses Conv and BatchNorm into Conv node -// Conv and BatchNorm can be fused only if inputs for Batchnorm node: +// Conv and BatchNorm can be fused only if inputs for BatchNorm node: // scale, bias, mean and var are all tensors of same shape (C) and // if the size of the first dimension (dim 0) is the same between Conv -// input weight and Batchnorm input scale +// input weight and BatchNorm input scale. 
static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) { for (auto* child_block : it->blocks()) { @@ -53,81 +54,84 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { if (bnNode->kind() != onnx::BatchNormalization) { continue; } - auto origconvNode = *it; + auto oldConv = *it; auto epsilon = bnNode->f(attr::epsilon); - auto w_conv_value = getValues(origconvNode, valsToParamsMap); - if (w_conv_value.size() < 1 || - (origconvNode->inputs().size() == 3 && w_conv_value.size() != 2)) { + auto convInputVals = getValues(oldConv, valsToParamsMap); + if (convInputVals.size() < 1 || + (oldConv->inputs().size() == 3 && convInputVals.size() != 2)) { continue; } - auto bn_value = getValues(bnNode, valsToParamsMap); - if (bn_value.size() != 4) { + auto bnInputVals = getValues(bnNode, valsToParamsMap); + if (bnInputVals.size() != 4) { continue; } - auto bn_scale = bn_value[0].clone(); - auto bn_B = bn_value[1].clone(); - auto bn_mean = bn_value[2].clone(); - auto bn_var = bn_value[3].clone(); - auto w_conv = w_conv_value[0].clone(); - at::Tensor b_conv; - - if (!bn_scale.is_floating_point() || !bn_B.is_floating_point() || - !bn_mean.is_floating_point() || !bn_var.is_floating_point() || - !w_conv.is_floating_point() || bn_scale.dim() != 1 || - bn_B.dim() != 1 || bn_mean.dim() != 1 || bn_var.dim() != 1 || - !(bn_scale.size(0) == bn_B.size(0)) || - !(bn_B.size(0) == bn_mean.size(0)) || - !(bn_mean.size(0) == bn_var.size(0)) || !(w_conv.dim() > 2) || - !(w_conv.size(0) == bn_scale.size(0))) { + // See + // https://github.com/onnx/onnx/blob/master/docs/Operators.md#BatchNormalization + auto bnScale = bnInputVals[0].clone(); + auto bnB = bnInputVals[1].clone(); + auto bnMean = bnInputVals[2].clone(); + auto bnVar = bnInputVals[3].clone(); + // See https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv + auto convW = convInputVals[0].clone(); + at::Tensor convB; + + if (!bnScale.is_floating_point() || !bnB.is_floating_point() || + !bnMean.is_floating_point() || !bnVar.is_floating_point() || + !convW.is_floating_point() || bnScale.dim() != 1 || bnB.dim() != 1 || + bnMean.dim() != 1 || bnVar.dim() != 1 || + !(bnScale.size(0) == bnB.size(0)) || + !(bnB.size(0) == bnMean.size(0)) || + !(bnMean.size(0) == bnVar.size(0)) || !(convW.dim() > 2) || + !(convW.size(0) == bnScale.size(0))) { continue; } - bn_var = bn_var.add(epsilon); - bn_var = bn_var.sqrt(); - bn_scale = bn_scale.div(bn_var); + bnVar = bnVar.add(epsilon); + bnVar = bnVar.sqrt(); + bnScale = bnScale.div(bnVar); // Calculate weight - for (const auto i : c10::irange(w_conv.size(0))) { - w_conv[i] = w_conv[i].mul(bn_scale[i]); + for (const auto i : c10::irange(convW.size(0))) { + convW[i] = convW[i].mul(bnScale[i]); } // Calculate bias - if (origconvNode->inputs().size() == 3) { - b_conv = w_conv_value[1].clone(); - b_conv = b_conv.sub(bn_mean); - b_conv = b_conv.mul(bn_scale); - b_conv = b_conv.add(bn_B); + if (oldConv->inputs().size() == 3) { + convB = convInputVals[1].clone(); + convB = convB.sub(bnMean); + convB = convB.mul(bnScale); + convB = convB.add(bnB); } else { - bn_mean = bn_mean.mul(bn_scale); - bn_B = bn_B.sub(bn_mean); - b_conv = bn_B; + bnMean = bnMean.mul(bnScale); + bnB = bnB.sub(bnMean); + convB = bnB; } - Node* convNode = + Node* newConv = b->owningGraph()->create(onnx::Conv, bnNode->outputs().size()); - for (size_t i = 0; i < convNode->outputs().size(); ++i) { - 
convNode->outputs()[i]->copyMetadata(bnNode->outputs()[i]); + for (size_t i = 0; i < newConv->outputs().size(); ++i) { + newConv->outputs()[i]->copyMetadata(bnNode->outputs()[i]); } - convNode->copyAttributes(*origconvNode); - convNode->insertBefore(bnNode); - convNode->addInput(origconvNode->inputs().at(0)); + newConv->copyAttributes(*oldConv); + newConv->insertBefore(bnNode); + newConv->addInput(oldConv->inputs().at(0)); - auto conv_W = b->owningGraph()->addInput(); + auto newConvW = b->owningGraph()->addInput(); valsToParamsMap.insert( - {conv_W, std::make_pair(conv_W->debugName(), w_conv)}); - conv_W->inferTypeFrom(w_conv); - convNode->addInput(conv_W); + {newConvW, std::make_pair(newConvW->debugName(), convW)}); + newConvW->inferTypeFrom(convW); + newConv->addInput(newConvW); - auto conv_B = b->addInput(); + auto newConvB = b->addInput(); valsToParamsMap.insert( - {conv_B, std::make_pair(conv_B->debugName(), b_conv)}); - conv_B->inferTypeFrom(b_conv); - convNode->addInput(conv_B); + {newConvB, std::make_pair(newConvB->debugName(), convB)}); + newConvB->inferTypeFrom(convB); + newConv->addInput(newConvB); - bnNode->replaceAllUsesWith(convNode); + bnNode->replaceAllUsesWith(newConv); bnNode->removeAllInputs(); it->removeAllInputs(); bnNode->destroy(); @@ -140,7 +144,11 @@ void EvalPeepholeONNX(Block* b, ParamMap& paramsDict) { auto valsToParamsMap = buildValueToParamsMap(b, paramsDict); fuseConvBatchNorm(b, valsToParamsMap); buildParamsMapFromValueToParamsMap(valsToParamsMap, paramsDict); - return; +} + +void EvalPeepholeONNX(std::shared_ptr& g, ParamMap& paramsDict) { + EvalPeepholeONNX(g->block(), paramsDict); + GRAPH_DUMP("After EvalPeepholeONNX:", g); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.h b/torch/csrc/jit/passes/onnx/eval_peephole.h index 9b91b2060dd11..6f8961d08fd5e 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.h +++ b/torch/csrc/jit/passes/onnx/eval_peephole.h @@ -1,11 +1,15 @@ #pragma once +#include + #include namespace torch { namespace jit { -void EvalPeepholeONNX(Block* b, std::map& paramDict); +void EvalPeepholeONNX( + std::shared_ptr& g, + std::map& paramDict); } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 65acd147b41f8..5905f755e5b6c 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -25,9 +26,11 @@ bool isRNN(const Node* node) { } bool isNopTranspose(const std::vector& perm) { - for (int64_t i = 0, perm_size = perm.size(); i < perm_size; i++) - if (perm[i] != i) + for (int64_t i = 0, perm_size = perm.size(); i < perm_size; i++) { + if (perm[i] != i) { return false; + } + } return true; } @@ -106,6 +109,10 @@ c10::optional fusibleExpandTo( return to.size() - from.size(); } +// Fuses expand calls into ONNX operators, because it is +// easier for non-strided backends to more efficiently do broadcasts if this +// is local information. This optimization is not useful for PyTorch as +// 'expand' is free. void fuseBroadcast(Block* b) { for (auto n : b->nodes()) { for (auto* child_block : n->blocks()) { @@ -134,8 +141,9 @@ void fuseBroadcast(Block* b) { // not generated from symbolic), but if for some reason we don't // have it, we need to skip. if (!unexpanded_input->isCompleteTensor() || - !n->output()->isCompleteTensor()) + !n->output()->isCompleteTensor()) { continue; + } // Not all broadcasts are supported by ONNX broadcast. 
c10::optional axis = fusibleExpandTo( @@ -150,8 +158,9 @@ void fuseBroadcast(Block* b) { .sizes() .concrete_sizes() .value()); // to - if (axis == c10::nullopt) + if (axis == c10::nullopt) { continue; + } n->replaceInput(position, unexpanded_input); if (!expand_node->hasUses()) { @@ -239,7 +248,6 @@ void fuseTransposeIntoGemm(Block* b) { // the removeNopPacking pass removes the packing operations // entirely by pairing them with their inverse PadPacked. If the // input graph does not pair the operations, export will fail. - void pushPackingPastRnn(Block* b) { for (auto it = b->nodes().begin(); it != b->nodes().end(); ++it) { auto* n = *it; @@ -431,11 +439,11 @@ void fixDefaultRNNState( // traced as a ConstantOfShape with the expected Shape. // 3- When the batch size is fixed, everything works great as well. // 4- When h0 and c0 are specified but are not inputs of the model (they are - // Constants) - // and the batch size is variable, the model should be saved with a batch - // size of 1 (or an error will occur), and we save the value of h0 and c0 - // with a batch size of 1. When the model is then called with a different - // batch size value, h0 and c0 are broadcasted to get the right shape. + // Constants) and the batch size is variable, the model should be saved + // with a batch size of 1 (or an error will occur), and we save the value + // of h0 and c0 with a batch size of 1. When the model is then called with + // a different batch size value, h0 and c0 are broadcasted to get the right + // shape. // Recognize that last pattern here (4) and fix the shape. // Note that for multi-layer RNNs there will be a Slice operation between the // Constant and the RNN. @@ -545,6 +553,12 @@ static bool isSafeToSpeculate(Node* n) { return n->kind() == onnx::Transpose; } +// Moves ops outside of control flow blocks so that they are always executed, +// no matter the result of the control flow conditions. +// Needed only so that the split pass of the ONNX optimizer will put the ops +// into the init_net. +// TODO: Once the code in caffe2/python/onnx/backend.py no longer calls +// optimize_onnx, delete this function. static void speculateOps(Block* block) { for (auto it = block->nodes().begin(), end = block->nodes().end(); it != end;) { @@ -554,13 +568,15 @@ static void speculateOps(Block* block) { for (auto b : n->blocks()) { speculateOps(b); } - if (!isSafeToSpeculate(n)) + if (!isSafeToSpeculate(n)) { continue; + } // XXX - only works for nodes with a single input // move node n outside of the control flow it is nested in auto node_input = n->input()->node(); - if (node_input->owningBlock() == n->owningBlock()) + if (node_input->owningBlock() == n->owningBlock()) { continue; + } // Skip if output of this node is part of block output. bool is_block_output = false; for (auto node_output : n->outputs()) { @@ -580,8 +596,9 @@ static void speculateOps(Block* block) { // find the control flow node in the same block as node_input that contains // Node n auto control_flow_node = n->owningBlock()->owningNode(); - while (control_flow_node->owningBlock() != node_input->owningBlock()) + while (control_flow_node->owningBlock() != node_input->owningBlock()) { control_flow_node = control_flow_node->owningBlock()->owningNode(); + } // put the node right before this flow node n->moveBefore(control_flow_node); } @@ -669,6 +686,7 @@ static void eraseListConstruct(Block* block, int opset_version) { static void eraseListUnpack(Block* block, int opset_version); +// Replace prim::ListUnpack with onnx::SequenceAt. 
static void eraseListUnpack(Node* n, int opset_version) { for (auto b : n->blocks()) { eraseListUnpack(b, opset_version); @@ -708,9 +726,17 @@ static void eraseListUnpack(Block* block, int opset_version) { } } -// For ops such as meshgrid where output is a list of Tensors -// (returns prim::ListConstruct), we need to unpack the list -// before the pass which deletes ListConstruct. +// From: +// %list = ListConstruct(%x); +// %unpacked = ListUnpack(%list); +// do_something(%unpacked); +// +// To: +// %list = ListConstruct(%x); +// %unpacked = ListUnpack(%list); +// do_something(%x) +// +// The ListConstruct and ListUnpack may now be dead code. static void fuseListConstructListUnpack(Block* b) { for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) { for (auto* child_block : it->blocks()) { @@ -794,8 +820,9 @@ static void fuseLogSoftmaxNllLoss(Block* b) { // (%10) origLogSoftmaxNode = prev->input(0)->node(); auto transpose = origLogSoftmaxNode->input(0)->node(); - if (transpose->inputs().size() > 0) + if (transpose->inputs().size() > 0) { origLogSoftmaxNode->replaceInput(0, transpose->inputs().at(0)); + } } else if ( prev->kind() == onnx::Reshape && prev->input(0)->node()->kind() == onnx::Transpose && @@ -933,10 +960,10 @@ static void removeSequenceSplitConcat(Block* b) { } } +// Work around limitation from ONNX that the block input cannot be used directly +// as block output. Inserts an Identity node inside the block, and have the +// block return the output of the Identity. static void insertIdentityForInputUsedAsOutput(Block* b) { - // Resolving limitation from ONNX that the block input cannot be used directly - // as block output. Inserting an Identity node inside - // the block, linking with the value as workaround. for (auto out : b->outputs()) { auto n = out->node(); if (nullptr != n && n->kind() == prim::Param) { @@ -957,21 +984,11 @@ static void insertIdentityForInputUsedAsOutput(Block* b) { // This optimization does ONNX-specific peephole optimizations. // -// At the moment, here are the optimizations it does: -// - This optimization fuses expand calls into ONNX operators, because it is -// easier for non-strided backends to more efficiently do broadcasts if this -// is local information. This optimization is not useful for PyTorch as -// 'expand' is free. -// - Fusing of consecutive transposes -// - Elimination of NOP transposes -// - Fusing of transposes into Gemm -// - Elimination of PaddedSequences -// // Before you write an optimization here, ask yourself, "Could I do this // optimization on ATen operators"? If so, you should seriously consider // writing your optimization in jit/passes/peephole.cpp rather than // here, as it will be generally applicable to the JIT as well. The -// optimizations here are ONLY applied on ONNX update +// optimizations here are ONLY applied on ONNX export. 
void PeepholeOptimizeONNX( std::shared_ptr& graph, int opset_version, @@ -1004,6 +1021,8 @@ void PeepholeOptimizeONNX( removeMaxPoolUnusedOutput(graph->block()); removeSequenceSplitConcat(graph->block()); insertIdentityForInputUsedAsOutput(graph->block()); + + GRAPH_DUMP("After PeepholeOptimizeONNX", graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp b/torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp index 709b174e94285..22ff8e84a98a3 100644 --- a/torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp @@ -1,10 +1,13 @@ #include #include +#include namespace torch { namespace jit { +// onnx only supports tensors, but 1 / 2 = 0.5 and tensor(1) / tensor(2) = 0, +// so before converting the ints to tensors we need to cast them to floats. static void PrepareDivisionForONNXOnBlock(Block* block) { for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { for (auto sub : it->blocks()) { @@ -35,6 +38,7 @@ static void PrepareDivisionForONNXOnBlock(Block* block) { void PrepareDivisionForONNX(const std::shared_ptr& graph) { PrepareDivisionForONNXOnBlock(graph->block()); + GRAPH_DUMP("After PrepareDivisionForONNX: ", graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/preprocess_for_onnx.cpp b/torch/csrc/jit/passes/onnx/preprocess_for_onnx.cpp index 7191476a1f422..5ab090257b3ea 100644 --- a/torch/csrc/jit/passes/onnx/preprocess_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/preprocess_for_onnx.cpp @@ -66,10 +66,9 @@ void FuseWithListUnpack(Node* n) { TORCH_INTERNAL_ASSERT(n->outputs().size() == 1); // 1. Add internal input "_outputs" to node, so that later symbolic function - // conversion - // is aware of the number of outputs. + // conversion is aware of the number of outputs. // 2. Add the exact number of outputs to n, copy metadata and replace uses of - // listUnpack outputs. + // listUnpack outputs. 
n->i_( Symbol::fromQualString("attr::_outputs"), static_cast(listUnpack_node->outputs().size())); @@ -169,24 +168,22 @@ static void ReplaceAddWithConcat(Block* b) { // before the pass // graph(%x.1 : Float(2, 3, strides=[3, 1], requires_grad=0, device=cpu)): // %1 : None = prim::Constant() -// %2 : int[] = aten::size(%x.1) # :7:9 +// %2 : int[] = aten::size(%x.1) // %a.1 : int, %b.1 : int = prim::ListUnpack(%2) // %5 : int[] = prim::ListConstruct(%a.1, %b.1) -// %6 : Tensor = aten::new_zeros(%x.1, %5, %1, %1, %1, %1) # -// test/onnx/test_pytorch_onnx_onnxruntime.py:1757:23 return (%6) +// %6 : Tensor = aten::new_zeros(%x.1, %5, %1, %1, %1, %1) // // after the pass: // graph(%x.1 : Float(2, 3, strides=[3, 1], requires_grad=0, device=cpu)): // %1 : None = prim::Constant() -// %2 : int[] = aten::size(%x.1) # :7:9 +// %2 : int[] = aten::size(%x.1) // %7 : Tensor = onnx::Constant[value={0}]() // %8 : Tensor = onnx::Gather(%2, %7) // %9 : Tensor = onnx::Constant[value={1}]() // %10 : Tensor = onnx::Gather(%2, %9) // %a.1 : int, %b.1 : int = prim::ListUnpack(%2) // %5 : int[] = prim::ListConstruct(%8, %10) -// %6 : Tensor = aten::new_zeros(%x.1, %5, %1, %1, %1, %1) # -// test/onnx/test_pytorch_onnx_onnxruntime.py:1757:23 return (%6) +// %6 : Tensor = aten::new_zeros(%x.1, %5, %1, %1, %1, %1) static void fuseListAndListUnpack(Block* b) { for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) { for (auto* child_block : it->blocks()) { @@ -222,8 +219,11 @@ static void fuseListAndListUnpack(Block* b) { void PreprocessForONNX(std::shared_ptr& graph) { FuseWithListUnpack(graph->block()); + GRAPH_DUMP("After FuseWithListUnpack: ", graph); ReplaceAddWithConcat(graph->block()); + GRAPH_DUMP("After ReplaceAddWithConcat: ", graph); fuseListAndListUnpack(graph->block()); + GRAPH_DUMP("After fuseListAndListUnpack: ", graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index c0d36361efa6b..19d2a6cdc6fd3 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -332,6 +332,7 @@ static void PrepareForRemoveMutations(MutationRemover& mr, Block* b) { static void PrepareForRemoveMutations(std::shared_ptr graph) { MutationRemover mr(graph); PrepareForRemoveMutations(mr, graph->block()); + GRAPH_DUMP("After PrepareForRemoveMutations: ", graph); } // findSubModuleAttr function chases getAttr chains backwards to locate the @@ -769,10 +770,10 @@ void InplaceConverter::replaceAttrWithInplaceOps( void InplaceConverter::convertGetSetAttrToInplaceOps(Block* block) { std::unordered_map attr_name_value_map = {}; std::unordered_map attr_node_fullname_map = {}; - // First pass over graph, to gather all attribute names, and their intial + // First pass over graph, to gather all attribute names, and their initial // values. Create dummy initial values for attributes if necessary. By the end // of this pass, these dummy initial values should have zero uses, and can be - // safely removed. Otherwise it will imply error in model for using + // safely removed. Otherwise it will imply an error in the model for using // uninitialized values. 
gatherAttrNameInitialValueMap( block, attr_name_value_map, attr_node_fullname_map); diff --git a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp index 4aca272a4f4b1..a6a9eb5c788c3 100644 --- a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp +++ b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp @@ -1,6 +1,6 @@ -#include - +#include #include +#include namespace torch { namespace jit { @@ -257,7 +257,7 @@ static void UpdateScalarTypeForInputs( (input_scalar_type && (*input_scalar_type != scalar_type))) { if (input->node()->kind() == onnx::Constant) { // Fix up the scalar directly instead of inserting a cast operator. - // NOTE: Keep only the else branch once constant_folding is enabled by + // TODO: Keep only the else branch once constant_folding is enabled by // default. at::Tensor val = input->node()->t(attr::value); at::Tensor new_val = val.to(scalar_type); @@ -381,6 +381,7 @@ void ScalarTypeAnalysisForONNX( if (lowprecision_cast) { LowPrecisionCastForStandardOpsONNX(graph->block(), opset_version); } + GRAPH_DUMP("After ScalarTypeAnalysisForONNX: ", graph); } void ScalarTypeAnalysisNodeForONNX(Node* n) { diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 96afc50418250..c4ca729f2c914 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -45,8 +45,8 @@ TypePtr MergeInferredType(TypePtr existing_type, TypePtr inferred_type) { if (new_tensor_type && old_tensor_type) { if (!old_tensor_type->device()) { - // device not avaible means this is an invalid tensor type (most likely an - // empty one) return inferred type directly. + // device not available means this is an invalid tensor type (most likely + // an empty one) -> return inferred type directly. return new_tensor_type; } auto type = old_tensor_type; @@ -1582,7 +1582,8 @@ size_t ONNXAssignOutputShape( auto& new_var = THPVariable_Unpack(list_elem); TORCH_CHECK( var.scalar_type() == new_var.scalar_type(), - "Unsupported sequence type in model outputs. ONNX supports sequences of elements of the same data type."); + "Unsupported sequence with mixed elment types in model outputs. " + "ONNX supports only sequences of elements of the same data type."); } auto elem_type = graph->outputs() .at(outputs_index) @@ -1636,9 +1637,8 @@ size_t ONNXAssignOutputShape( // outputs have been disabled. } else { std::string msg = - "Only tuples, lists and Variables are supported as JIT inputs/outputs. " - "Dictionaries and strings are also accepted, but their usage is not " - "recommended. Here, received an input of unsupported type: "; + ("Model output has unsupported type. See " + "https://pytorch.org/docs/stable/onnx.html#types. 
Got type: "); msg += THPUtils_typename(output_obj); throw std::runtime_error(msg); } @@ -1665,6 +1665,7 @@ void ONNXAssignOutputShape( "Incorrect number of elements provided as example outputs."); Py_DECREF(py_obj); + GRAPH_DUMP("After ONNXAssignOutputShape", graph); } void ONNXShapeTypeInference( diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp index f027e73ee684a..4f4643fb066a7 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -480,6 +481,7 @@ void UnpackQuantizedWeights( qconv3d_relu, "quantized::conv3d_unpack", QuantizedParamsType::CONV); + GRAPH_DUMP("After UnpackQuantizedWeights: ", graph); } // Caffe2 expects quantized ops to be in NHWC format while pytorch inputs are in @@ -536,6 +538,7 @@ void insertPermutes( insertPermutesHelper(graph, paramsDict, qconv); insertPermutesHelper(graph, paramsDict, qconv_relu); insertPermutesHelper(graph, paramsDict, qconv_transpose); + GRAPH_DUMP("After insertPermutes: ", graph); } } // namespace jit diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index 9caa33fac34e9..bacc4bc190ea3 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -40,12 +40,7 @@ struct PeepholeOptimizeImpl { // The intent for this optimization pass is to catch all of the small, easy to // catch peephole optimizations you might be interested in doing. // - // Right now, it does: - // - Eliminate no-op 'expand' nodes - // - Simply x.t().t() to x - // // TODO: Decide what kind of fixed point strategy we will have - // bool optimizeBlock(Block* block) { bool changed = false; for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { @@ -62,6 +57,9 @@ struct PeepholeOptimizeImpl { // canonicalize those if (node->matches( "aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)")) { + // Eliminate no-op _grad_sum_to_size. + // TODO: this doesn't work with Scalar-Tensor ops! 
We should + // canonicalize those if (node->input(1)->mustBeNone()) { GRAPH_UPDATE( getHeader(node), @@ -139,7 +137,7 @@ struct PeepholeOptimizeImpl { if (input_node->kind() == prim::NumToTensor) { GRAPH_UPDATE( getHeader(node), - " (x.NumToTensor().TensorToNum() == x.NumToTensor()) is replaced with ", + " (x.NumToTensor() == x) is replaced with ", node->input()->debugName()); node->output()->replaceAllUsesWith(input_node->input()); changed = true; @@ -197,6 +195,10 @@ struct PeepholeOptimizeImpl { IValue ival(*ptt->sizes()[norm_index]); auto const_sizes_val = node->owningGraph()->insertConstant(ival); node->output()->replaceAllUsesWith(const_sizes_val); + GRAPH_UPDATE( + getHeader(node), + " (x.size(dim)) is replaced with constant ", + const_sizes_val->debugName()); changed = true; } } @@ -211,6 +213,10 @@ struct PeepholeOptimizeImpl { IValue ival(at::isFloatingType(dtype)); auto new_constant = node->owningGraph()->insertConstant(ival); node->output()->replaceAllUsesWith(new_constant); + GRAPH_UPDATE( + getHeader(node), + " (x.is_floating_point()) is replaced with ", + new_constant->debugName()); changed = true; } } else if ( @@ -223,6 +229,11 @@ struct PeepholeOptimizeImpl { IValue ival(at::isComplexType(dtype)); auto new_constant = node->owningGraph()->insertConstant(ival); node->output()->replaceAllUsesWith(new_constant); + GRAPH_UPDATE( + getHeader(node), + " (x.is_complex()) is replaced with ", + new_constant->debugName()); + changed = true; } } else if ( node->matches("prim::dtype(Tensor a) -> int") && shape_peepholes_) { @@ -362,25 +373,13 @@ bool FuseAddMM(Block* block) { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", /*const_inputs=*/attr::alpha)) { // z + x.mm(y) == z.addmm(x, y) == x.mm(y) + z - // This optimization has been disabled at the moment, because it's not - // helpful at all until we will be able to represent torch.addmm(a, b, - // c, out=a). That's because addmm dispatches internally to gemm, which - // computes: - // C = beta * C + alpha * A @ B - // but aten::addmm(a, b, c, 1, 1) is really: - // D = beta * C + alpha * A @ B - // and because it works out of place on C, we're only trading off an - // explicit add for a copy inside the addmm function. Note that it - // doesn't even result in fewer reads, because mm won't even load C - // (because beta - // == 0 for it). if (node->get(attr::alpha).value().toDouble() == 1.) { // Look for mm from both sides of the add for (const auto mm_side : c10::irange(2)) { // Add will accept tensors of mismatched scalar types, as long as - // one of them is a scalar. Addmm will throw in that case, so we can - // only perform this fusion if we're sure that it is correct, and - // for that we need the add_mat_type. An alternative would be to + // one of them is a scalar, but addmm will throw in that case, so we + // can only perform this fusion if we're sure that it is correct, + // and for that we need the add_mat_type. An alternative would be to // insert a type_as conditional on the tensor shape being a scalar, // but that might add overhead, and make analysis harder. auto add_mat_type = @@ -415,9 +414,8 @@ bool FuseAddMM(Block* block) { } // We insert the type_as if we're sure that the added element is a - // scalar, and we either don't know what is the type of the - // scalar, or know the type, and know that it's - // mismatched. + // scalar, and we either don't know the type of the scalar, or + // know that it's mismatched. 
if (add_mat_type->sizes().size() && *add_mat_type->sizes().size() == 0 && !mustBeEqual(add_mat_type->scalarType(), mat_scalar_type)) { @@ -468,14 +466,25 @@ bool FuseAddMM(Block* block) { } // FuseAddMM is a separate pass from peephole optimize because it is currently -// only done as an optimization for onnx. -// as it is today, fusing add + mm has no benefit within PyTorch running ATen -// ops. However, we rely on seeing the fused version of addmm for ONNX export, -// since after ONNX translation we would see redundant Gemm ops with sub-optimal -// inputs. This flag is exposed so that ONNX export can pass `true` to get the -// fused behavior, but normal JIT peephole optimization is left alone. +// used for exporting to ONNX. +// Today, fusing add + MM has no benefit within PyTorch running ATen +// ops. However, we rely on seeing the fused version of AddMM for ONNX export, +// since otherwise after ONNX translation we would see redundant Gemm ops with +// sub-optimal inputs. +// It won't be helpful for ATen until we're able to represent +// torch.addmm(a, b, c, out=a). +// That's because addmm dispatches internally to gemm, which computes: +// C = beta * C + alpha * A @ B +// but aten::addmm(a, b, c, 1, 1) is really: +// D = beta * C + alpha * A @ B +// and because it works out of place on C, we're only trading off an +// explicit add for a copy inside the addmm function. Note that it +// doesn't even result in fewer reads, because mm won't even load C +// (because beta == 0 for it). bool FuseAddMM(const std::shared_ptr& graph) { - return FuseAddMM(graph->block()); + bool changed = FuseAddMM(graph->block()); + GRAPH_DUMP("After FuseAddMM: ", graph); + return changed; } bool PeepholeOptimize( diff --git a/torch/csrc/jit/passes/peephole_alias_sensitive.cpp b/torch/csrc/jit/passes/peephole_alias_sensitive.cpp index 0a4324daf3985..53be34fbc37f9 100644 --- a/torch/csrc/jit/passes/peephole_alias_sensitive.cpp +++ b/torch/csrc/jit/passes/peephole_alias_sensitive.cpp @@ -51,9 +51,9 @@ struct PeepholeOptimizeAliasSensitiveImpl { auto kind = node->kind(); int64_t output_size = kind == aten::conv1d ? 3 : (kind == aten::conv2d ? 4 : 5); - // this is to handle potential resize_ calls, however unlikely - // if we add more checks related to resize_ in the graph, - // factor this out like collectResizeSet in shape_analysis + // This is to handle potential resize_ calls, however unlikely. + // If we add more checks related to resize_ in the graph, + // factor this out like collectResizeSet in shape_analysis. if (!aliasDb_->hasWriters(node->output())) { for (const Use& dim_use : dim_uses) { replaceWithIValue(dim_use.user->output(), output_size); diff --git a/torch/csrc/jit/passes/remove_inplace_ops.cpp b/torch/csrc/jit/passes/remove_inplace_ops.cpp index 30b528b64cdfc..10c9ab42c8557 100644 --- a/torch/csrc/jit/passes/remove_inplace_ops.cpp +++ b/torch/csrc/jit/passes/remove_inplace_ops.cpp @@ -1,81 +1,8 @@ +#include #include namespace torch { namespace jit { -namespace { -static const std::unordered_map inPlaceToOutOfPlace = { - {aten::add_, aten::add}, - {aten::sub_, aten::sub}, - {aten::div_, aten::div}, - {aten::mul_, aten::mul}, - {aten::masked_fill_, aten::masked_fill}, - {aten::zero_, aten::zeros_like}, - {aten::fill_, aten::full_like}}; - -// This is a horrible no good awful hack to "fill in" the TensorOptions -// arguments of zeros_like and full_like so that the defaults are filled -// in. Ugh. Would be better to just run the frontend to get the correct -// arity here. 
-static const std::unordered_map expectedInputCount = { - {aten::zero_, 6}, - {aten::fill_, 7}}; - -bool isInplaceOp(const Node* node) { - return inPlaceToOutOfPlace.count(node->kind()) != 0; -} - -// Remove all in-place ops and replace them with out-of-place equivalents. -// e.g. -// %foo = aten::add_(%foo, %n) -// becomes -// %foo.2 = aten::add(%foo, %n) -// -// NOTE: this is NOT SAFE, since it assumes that the LHS is not aliased by -// another value. This is only to avoid breaking ONNX export; when alias -// analysis is done we can emit a warning if someone tries to export. -void RemoveInplaceOps(Block* block) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - RemoveInplaceOps(block); - } - - if (isInplaceOp(node)) { - // create a replacement out of place op - auto newNode = graph->create(inPlaceToOutOfPlace.at(node->kind())); - newNode->insertBefore(node); - newNode->setScope(node->scope()); - // copy inputs - for (auto input : node->inputs()) { - newNode->addInput(input); - } - - int additionalInputCount = 0; - if (expectedInputCount.find(node->kind()) != expectedInputCount.end()) { - additionalInputCount = expectedInputCount.at(node->kind()) - - static_cast(newNode->inputs().size()); - } - - for (int i = 0; i < additionalInputCount; ++i) { - auto noneNode = graph->createNone(); - noneNode->insertBefore(newNode); - newNode->addInput(noneNode->output()); - } - - // Create a new output node and replace all uses of self with it - newNode->output()->copyMetadata(node->output()); - node->replaceAllUsesWith(newNode); - node->inputs()[0]->replaceAllUsesAfterNodeWith( - newNode, newNode->output()); - node->destroy(); - } - } -} -} // namespace - // Handles special case of binary inplace ops, where the first input node // has a lower type precedence than the second input node. When the // inplace node is converted to a regular op, this information is lost and @@ -85,18 +12,19 @@ void RemoveInplaceOps(Block* block) { // are the same. // An example scenario would be: // Before: -// graph(%0 : Float), -// %1 : Half): -// %4 : Float = onnx::Cast[to=1](%1) -// %5 : Float = onnx::Add(%4, %0) +// graph(%0 : Half), +// %1 : Float): +// # Should result in a Half, but after translation to out-of-place, +// # would become a Float b/c Half+Float -> Float. +// Float : = aten::add_(%0, %1) // ... // After: -// graph(%0 : Float), -// %1 : Half): -// %4 : Half = onnx::Cast[to=10](%0) -// %5 : Half = onnx::Add(%1, %4) +// graph(%0 : Half), +// %1 : Float): +// %2 : Half = aten::type_as(%1, %0) +// # Half + Half will result in correct dtype. +// Half : = aten::add_(%0, %2) // ... 
- void ImplicitCastForBinaryInplaceOps(Block* b) { for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) { for (auto* child_block : it->blocks()) { @@ -106,30 +34,25 @@ void ImplicitCastForBinaryInplaceOps(Block* b) { // Check type if inplace operation is a binary node if ((it->kind() == aten::add_) || (it->kind() == aten::sub_) || (it->kind() == aten::mul_) || (it->kind() == aten::div_)) { - auto orignalInputs = it->inputs(); - if (orignalInputs.at(0) == orignalInputs.at(1)) { + auto originalInputs = it->inputs(); + if (originalInputs.at(0) == originalInputs.at(1)) { continue; } TensorTypePtr firstInp_tensor = - orignalInputs.at(0)->type()->cast(); + originalInputs.at(0)->type()->cast(); TensorTypePtr secondInp_tensor = - orignalInputs.at(1)->type()->cast(); + originalInputs.at(1)->type()->cast(); if (!(firstInp_tensor) || !(secondInp_tensor) || !(firstInp_tensor->scalarType().has_value())) { continue; } auto newInputNode = it->owningGraph()->create(aten::type_as, 1); newInputNode->insertBefore(*it); - newInputNode->addInput(orignalInputs.at(1)); - newInputNode->addInput(orignalInputs.at(0)); + newInputNode->addInput(originalInputs.at(1)); + newInputNode->addInput(originalInputs.at(0)); it->replaceInput(1, newInputNode->outputs().at(0)); } } } - -void RemoveInplaceOps(const std::shared_ptr& graph) { - ImplicitCastForBinaryInplaceOps(graph->block()); - RemoveInplaceOps(graph->block()); -} } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/remove_inplace_ops.h b/torch/csrc/jit/passes/remove_inplace_ops.h index e597da64860be..50cf9ab33c695 100644 --- a/torch/csrc/jit/passes/remove_inplace_ops.h +++ b/torch/csrc/jit/passes/remove_inplace_ops.h @@ -7,8 +7,6 @@ namespace torch { namespace jit { // see .cpp for docs -TORCH_API void RemoveInplaceOps(const std::shared_ptr& graph); - TORCH_API void ImplicitCastForBinaryInplaceOps(Block* block); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/remove_mutation.cpp b/torch/csrc/jit/passes/remove_mutation.cpp index 758b596e12ec2..01c2cb81f1034 100644 --- a/torch/csrc/jit/passes/remove_mutation.cpp +++ b/torch/csrc/jit/passes/remove_mutation.cpp @@ -256,12 +256,12 @@ bool MutationRemover::RemoveTensorMutation(Block* block) { // For the remainder of the function, x0 will have the // same aliasing relationships as the original x. // To avoid rebuilding the entire alias db, we can replace - // the memory dag element of x with x0. + // the memory DAG element of x with x0. getOrCreateAliasDb()->replaceWithNewValue( mutated_value, new_node->output()); // it is an invariant that all mutable types have an element in the memory - // dag so we must regive x an alias db element. We have already verified + // DAG so we must regive x an alias db element. We have already verified // that the mutated value is a fresh alias with a single use. 
getOrCreateAliasDb()->createValue(mutated_value); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 26b15e429a55b..bb528e1ea7145 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -203,7 +203,7 @@ void initJITBindings(PyObject* module) { "_jit_pass_onnx_eval_peephole", [](std::shared_ptr& graph, std::map& paramsDict) { - EvalPeepholeONNX(graph->block(), paramsDict); + EvalPeepholeONNX(graph, paramsDict); return paramsDict; }, pybind11::return_value_policy::move) @@ -216,7 +216,7 @@ void initJITBindings(PyObject* module) { std::map& paramsDict, int opset_version) { ConstantFoldONNX( - graph->block(), + graph, paramsDict, opset_version); // overload resolution return paramsDict; @@ -407,9 +407,6 @@ void initJITBindings(PyObject* module) { py::arg("g"), py::arg("value_name_pairs") = std::vector>()) - .def( - "_jit_pass_remove_inplace_ops", - [](const std::shared_ptr& g) { return RemoveInplaceOps(g); }) .def("_jit_pass_constant_pooling", ConstantPooling) .def( "_jit_pass_create_functional_graphs", @@ -559,9 +556,6 @@ void initJITBindings(PyObject* module) { python::unflatten(vars, desc)); }) .def("_jit_pass_onnx_block", BlockToONNX) - .def( - "_jit_pass_onnx_encapsulate_pattern_into_subblock", - EncapsulatePatternIntoSubblock) .def( "_jit_onnx_convert_pattern_from_subblock", ConvertPatternFromSubblock) .def("_jit_pass_fixup_onnx_controlflow_node", FixupONNXControlflowNode) diff --git a/torch/csrc/jit/python/python_arg_flatten.cpp b/torch/csrc/jit/python/python_arg_flatten.cpp index 30892ed3a9804..8d425bb751b25 100644 --- a/torch/csrc/jit/python/python_arg_flatten.cpp +++ b/torch/csrc/jit/python/python_arg_flatten.cpp @@ -68,18 +68,18 @@ void flatten_rec(PyObject* obj, ParsedArgs& args) { args.desc.structure.push_back(D::Variable); } else if (strcmp(THPUtils_typename(obj), "NoneType") == 0) { args.desc.structure.push_back(D::NoneType); - } else if (PyBool_Check(obj)) { // Wrap integers in bool tensors + } else if (PyBool_Check(obj)) { // Wrap bools in Bool tensors at::Tensor var = scalar_to_tensor(at::Scalar(THPUtils_unpackBool(obj))); args.vars.push_back(var); args.desc.metadata.emplace_back(var); args.desc.structure.push_back(D::Bool); - } else if (PyLong_Check(obj)) { // Wrap integers in long tensors + } else if (PyLong_Check(obj)) { // Wrap longs in Long tensors at::Tensor var = scalar_to_tensor( at::Scalar(static_cast(THPUtils_unpackLong(obj)))); args.vars.push_back(var); args.desc.metadata.emplace_back(var); args.desc.structure.push_back(D::Long); - } else if (PyFloat_Check(obj)) { // Wrap floating points in double tensors + } else if (PyFloat_Check(obj)) { // Wrap floats in Double tensors at::Tensor var = scalar_to_tensor(THPUtils_unpackDouble(obj)); args.vars.push_back(var); args.desc.metadata.emplace_back(var); diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp index 0803bfbf92003..87792092061cf 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -447,7 +447,7 @@ void EncoderBase::EncodeBlock( // be a subset of graph inputs. We use keep_initializers_as_inputs // argument to determine whether to add initializers // as inputs or not. If keep_initializers_as_inputs=false, - // we only add non-parameter inputs as inputs to ONNX graph, and. + // we only add non-parameter inputs as inputs to ONNX graph, and // not the initializers (parameters). If keep_initializers_as_inputs // =true, we add initializers as inputs too. 
Setting // keep_initializers_as_inputs=false allows better @@ -537,10 +537,10 @@ void EncoderBase::EncodeBlock( if (node->kind() == ::c10::onnx::If) { AT_ASSERT(node->blocks().size() == 2); - auto true_branch = p_n->add_attribute(); - true_branch->set_name("then_branch"); - true_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); - auto true_g = true_branch->mutable_g(); + auto then_branch = p_n->add_attribute(); + then_branch->set_name("then_branch"); + then_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + auto true_g = then_branch->mutable_g(); EncodeBlock( true_g, node->blocks()[0], @@ -551,10 +551,10 @@ void EncoderBase::EncodeBlock( use_external_data_format, onnx_file_path); - auto false_branch = p_n->add_attribute(); - false_branch->set_name("else_branch"); - false_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); - auto false_g = false_branch->mutable_g(); + auto else_branch = p_n->add_attribute(); + else_branch->set_name("else_branch"); + else_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + auto false_g = else_branch->mutable_g(); EncodeBlock( false_g, node->blocks()[1], @@ -697,7 +697,10 @@ void EncoderBase::AddAttribute( } break; default: - throw std::runtime_error("unexpected attribute kind"); + std::ostringstream err_msg; + err_msg << "attribute \"" << name.toDisplayString() + << "\" has unexpected kind: " << toString(node->kindOf(name)); + throw std::runtime_error(err_msg.str()); } } @@ -889,11 +892,6 @@ std::string pretty_print_onnx( return prettyPrint(graph_encoder.get_model_proto()); } -// export_raw_ir will export IR ops without turning them into ONNX ops. -// The output will use the ONNX protobuf format, but the ops will not -// conform to the ONNX op specification. Thus, the output will not -// be interpretable by a ONNX-compatible framework. However, PyTorch or -// libtorch will be able to import the IR and play it back. std::tuple< std::shared_ptr<::ONNX_NAMESPACE::ModelProto>, RawDataExportMap, From b52909d861b28364db921b5d7518248d756313e1 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Fri, 9 Jul 2021 17:36:29 -0700 Subject: [PATCH 051/122] [TensorExpr] Add python bindings for ArgValue class and TensorExprKernel constructor accepting custom lowerings. (#61385) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61385 The bindings coverage might be not full yet, but this already allows us to register custom lowerings from python. 
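For example, a custom lowering can be registered from Python roughly as in the sketch below, condensed from test_kernel_with_custom_lowering added in this patch (here te stands for torch._C._te, graph is a parsed TorchScript graph, and the helper name is illustrative):

    def nan_to_num_lowering(inputs, out_shape, out_type, device):
        # Elementwise compute: replace NaNs in the first input with 0.0.
        def compute(idxs):
            load = inputs[0].as_buf().load(idxs)
            return te.ifThenElse(
                te.ExprHandle.isnan(load), te.ExprHandle.float(0.), load)
        dims = [te.DimArg(d, 'i' + str(i)) for i, d in enumerate(out_shape)]
        return te.Compute2("custom_nan_to_num", dims, compute)

    kernel = torch._C._te.TensorExprKernel(
        graph, {'aten::nan_to_num': nan_to_num_lowering})
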
Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D29623487 Pulled By: ZolotukhinM fbshipit-source-id: b97ee420a57fd887e204c021b9e098764b2ee232 --- test/test_tensorexpr_pybind.py | 35 +++++++++++++++++ torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 38 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index cc4551515bb48..7ee1b4db26d50 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -333,6 +333,41 @@ def f(a): np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3) np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3) + @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled") + def test_kernel_with_custom_lowering(self): + def f(a): + return a.nan_to_num() + + device = 'cpu' + x = torch.ones((2, 2), device=device) + x[0, 0] = x[1, 1] = torch.nan + graph_str = """ +graph(%x : Float(2, 2, strides=[2, 1], requires_grad=0, device=cpu)): + %none : NoneType = prim::Constant() + %y : Float(2, 2, strides=[2, 1], requires_grad=0, device=cpu) = aten::nan_to_num(%x, %none, %none, %none) + return (%y) + """ + graph = torch._C.parse_ir(graph_str) + + def my_custom_lowering(inputs, out_shape, out_type, device): + def get_dim_args(dims): + dim_args = [] + for dim in dims: + dim_args.append(te.DimArg(dim, 'i' + str(len(dim_args)))) + return dim_args + + def compute(idxs): + load = inputs[0].as_buf().load(idxs) + return te.ifThenElse(te.ExprHandle.isnan(load), te.ExprHandle.float(0.), load) + return te.Compute2("custom_nan_to_num", get_dim_args(out_shape), compute) + + kernel = torch._C._te.TensorExprKernel(graph, {'aten::nan_to_num' : my_custom_lowering}) + res1 = kernel.run((x,)) + res2 = kernel.fallback((x,)) + correct = f(x) + np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3) + np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3) + @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled") def test_kernel_with_expand(self): def f(a): diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index d1f49c02ed94f..4752436feec92 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -643,9 +643,47 @@ void initTensorExprBindings(PyObject* module) { op, argInputs, outputShape, outputType.scalar_type()); }); + py::class_(te, "ArgValue") + .def(py::init([](py::handle inp) { + return std::make_unique(convertPyToArgValue(inp)); + })) + .def( + "as_buf", + [](const ArgValue& self) { return c10::get(self); }) + .def( + "as_var", + [](const ArgValue& self) { return c10::get(self); }) + .def( + "as_float", + [](const ArgValue& self) { return c10::get(self); }) + .def( + "as_int", + [](const ArgValue& self) { return c10::get(self); }) + .def("as_bool", [](const ArgValue& self) { return c10::get(self); }) + .def( + "as_none", + [](const ArgValue& self) { return c10::get(self); }) + .def( + "as_buflist", + [](const ArgValue& self) { return c10::get(self); }) + .def("as_intlist", [](const ArgValue& self) { + return c10::get(self); + }); + + py::class_(te, "ScalarType"); + using TSGraph = std::shared_ptr; py::class_(te, "TensorExprKernel") .def(py::init()) + .def(py::init([](const TSGraph& g, + std::unordered_map + custom_lowerings_str) { + std::unordered_map custom_lowerings; + for (const auto& kv : custom_lowerings_str) { + custom_lowerings[c10::Symbol::fromQualString(kv.first)] = kv.second; + } + 
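+        // Hand the graph and the Symbol-keyed custom lowerings to the kernel.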
return std::make_unique(g, custom_lowerings); + })) .def( "run", [](TensorExprKernel& self, const py::tuple& inputs) { From aaa1e07609b847cb8ec8d93cea1cb0ac51d32f4c Mon Sep 17 00:00:00 2001 From: Jamie King Date: Fri, 9 Jul 2021 18:12:50 -0700 Subject: [PATCH 052/122] Smart Decay for Adam - Caffe2 (#61488) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61488 We want to decay learning parameters properly. Previously this was not done when a parameter is absent from a minibatch. We fix this by keeping track of missed minibatches and making decay catch up accordingly. The exponential moving averages (EMA) for the first and second moments used in Adam are updated only for parameters seen in a minibatch. Actually, for these parameters, 0 should be added to the EMAs and the EMAs should then be decayed by multiplying by beta1 and beta2 respectively. To avoid the computational overhead of touching every parameter for every minibatch, we: * keep track of the last time a parameter is seen * instead of decaying the EMAs by multiplying by beta1 and beta2, we multiply by beta1^k and beta2^k, where k is the number of minibatches since the parameter was last seen. Differential Revision: D27978269 fbshipit-source-id: e47524101ddfcb281c46c505b9b7a8f0835bc64a --- caffe2/python/operator_test/adam_test.py | 110 ++++++++++++++++++++- caffe2/sgd/adam_op.cc | 39 ++++++++ caffe2/sgd/adam_op.h | 121 +++++++++++++++++++++++ 3 files changed, 266 insertions(+), 4 deletions(-) diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 60d39d831a93e..5de272b93db29 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -32,6 +32,37 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, else: return param_out, mom1_out, mom2_out + @staticmethod + def ref_smart_decay_adam(param, mom1, mom2, last_seen, grad, LR, ITER, + beta1, beta2, epsilon): + t = ITER + 1 + + k = int(np.array(t - last_seen).flatten()[0]) + last_seen_out = t + + if beta1 == 0.0: + mom1_out = grad + mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) + grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = param + LR * grad_out + return param_out, mom1_out, mom2_out, last_seen_out + + # Make up for lost minibatches. 
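+        # k - 1 minibatches were skipped for this parameter: decay the second
+        # moment by beta2**k in one shot, but replay the first-moment decay
+        # step by step, applying each decayed update to the parameter before
+        # folding in the current gradient.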
+ else: + mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) + p_out = param + m = mom1 + # For catchup + for i in range(k - 1): + m *= beta1 + update = m / (np.sqrt(mom2_out) + epsilon) + p_out += LR * update + # For the single step update + mom1_out = m * beta1 + grad * (1 - beta1) + grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = p_out + LR * grad_out + return param_out, mom1_out, mom2_out, last_seen_out + @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon, output_grad=False): @@ -181,6 +212,77 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs) + def test_smart_decay_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, + data_strategy, gc, dc): + param, mom1, mom2, grad = inputs + + mom2 = np.absolute(mom2) + ITER = np.array([ITER], dtype=np.int64) + # Here we will define the last_seen tensor as being randomly from 0 to ITER + # (the value of t to be tested will be ITER+1) + last_seen = np.random.randint(low=0, high=ITER + 1, size=param.shape, dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Verify that the generated indices are unique + hypothesis.assume( + np.array_equal( + np.unique(indices.flatten()), + np.sort(indices.flatten()))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "SmartDecaySparseAdam", + ["param", "mom1", "mom2", "last_seen", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "last_seen"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + last_seen_out = np.copy(last_seen) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], last_seen_out[index] = \ + self.ref_smart_decay_adam(param[index], mom1[index], mom2[index], last_seen[index], + grad[i], LR, ITER, + beta1, beta2, epsilon) + return (param_out, mom1_out, mom2_out, last_seen_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, last_seen, indices, grad, LR, ITER], + ref_sparse, + input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -194,7 +296,7 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): data_strategy=st.data(), **hu.gcs) def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): + data_strategy, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.absolute(mom2) ITER = np.array([ITER], dtype=np.int64) @@ -227,7 +329,7 @@ def 
test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): + beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) @@ -346,7 +448,7 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): data_strategy=st.data(), **hu.gcs) def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, - epsilon, data_strategy, gc, dc): + epsilon, data_strategy, gc, dc): param, mom1, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) @@ -390,7 +492,7 @@ def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): + beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index dd74a937c9c19..7e14e3d07df6b 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -91,6 +91,45 @@ OPERATOR_SCHEMA(SparseAdam) .Arg("epsilon", "Default 1e-5") .Arg("enableRAdam", "Default false"); +REGISTER_CPU_OPERATOR(SmartDecaySparseAdam, SmartDecaySparseAdamOp); +OPERATOR_SCHEMA(SmartDecaySparseAdam) + .NumInputs(8) + .NumOutputs(4) + .EnforceInplace({{0, 0}, {1, 1}, {2, 2}, {3, 3}}) + .DeviceInferenceFunction([](const OperatorDef& def) { + auto op_device = + def.has_device_option() ? def.device_option() : DeviceOption(); + vector in_dev(def.input_size(), op_device); + vector out_dev(def.output_size(), op_device); + // ITER input lives on CPU + in_dev[7] = DeviceOption(); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( + + Computes the Adam Update for the sparse case. + Given inputs (param, moment1, moment2, indices, grad, lr, iter), runs the dense + Adam on (param, moment1[indices], momemnt2[indices], lr, iter) and returns + (new_param, new_moment1, new_moment2) as in dense case. + Adam can be customized as Rectified Adam (RAdam) by setting enableRAdam = true. 
+ + )DOC") + .Input(0, "param", "Parameters to be updated") + .Input(1, "moment_1", "First moment history") + .Input(2, "moment_2", "Second moment history") + .Input(3, "last_seen", "Minibatch index when each weight was last seen") + .Input(4, "indices", "Sparse indices") + .Input(5, "grad", "Gradient computed") + .Input(6, "lr", "learning rate") + .Input(7, "iter", "iteration number") + .Output(0, "output_param", "Updated parameters") + .Output(1, "output_moment_1", "Updated first moment") + .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_last_seen", "Updated minibatch index when each weight was last seen") + .Arg("beta1", "Default 0.9") + .Arg("beta2", "Default 0.999") + .Arg("epsilon", "Default 1e-5"); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) REGISTER_CPU_OPERATOR( RowWiseSparseAdam, diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index c7a5db65db252..4e1b6e345f359 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -1,6 +1,7 @@ #pragma once #include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" namespace caffe2 { @@ -52,6 +53,42 @@ void adam_compute( } } +template +void adam_compute_smart_decay( + int N, + long int t, + const float* w, + const float* g, + const float* m, + const float* v, + const int64_t* lastSeenIn, + float* nw, + float* nm, + float* nv, + int64_t* lastSeenOut, + float beta1, + float beta2, + float eps_hat, + //float correction, + const float* lr, + Context* /*context*/) { + for (auto i = 0; i < N; ++i) { + float gi = g[i]; + // The number of steps since this param was last seen. + long int k = t - lastSeenIn[i]; + lastSeenOut[i] = t; + // Same as sparse Adam except v is decayed by beta2^k rather than beta2 + // Catchup = \sum_{i=1}^{k-1}\beta_1^i = \beta_1 \left(\frac{1-\beta_1^k}{1-\beta_1}\right) + float catchup = 0.0; + if (k > 1) { + catchup = m[i] * beta1 * (1 - std::pow(beta1, k)) / (1 - beta1); + } + float mi = nm[i] = m[i] * std::pow(beta1, k) + gi * (1 - beta1); + float vi = nv[i] = v[i] * std::pow(beta2, k) + gi * gi * (1 - beta2); + nw[i] = w[i] + (lr[0] * (mi + catchup)) / (std::sqrt(vi) + eps_hat); + } +} + template void adam_compute_output_grad( int N, @@ -509,6 +546,90 @@ class SparseAdamOp final : public Operator { OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; +template +class SmartDecaySparseAdamOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + SmartDecaySparseAdamOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + beta1_(this->template GetSingleArgument("beta1", 0.9f)), + beta2_(this->template GetSingleArgument("beta2", 0.999f)), + epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} + + bool RunOnDevice() override { + // Enforce shapes + CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(MOMENT_1).numel()); + CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(MOMENT_2).numel()); + CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(LAST_SEEN).numel()); + CAFFE_ENFORCE_EQ( + Input(PARAM).size_from_dim(1), + Input(GRAD).size_from_dim(Input(INDICES).dim())); + CAFFE_ENFORCE_EQ(Input(LR).numel(), 1); + + return DispatchHelper>::call( + this, Input(INDICES)); + } + + template + bool DoRunWithType() { + const auto* lr = Input(LR).template data(); + const auto iter = + OperatorBase::Input(ITER, CPU).template data()[0]; + + const int64_t t = iter + 1; + //const auto beta1_correction = T(1.) / (T(1.) - std::pow(beta1_, t)); + //const auto beta2_correction = + // T(1.) / std::sqrt(T(1.) 
- std::pow(beta2_, t)); + //const auto correction = beta1_correction / beta2_correction; + + auto block_size = Input(PARAM).numel() / Input(PARAM).size(0); + auto n = Input(GRAD).numel() / block_size; + + const auto* paramIn = Input(PARAM).template data(); + const auto* indices = Input(INDICES).template data(); + const auto* gradIn = Input(GRAD).template data(); + const auto* moment1In = Input(MOMENT_1).template data(); + const auto* moment2In = Input(MOMENT_2).template data(); + const int64_t* lastSeenIn = Input(LAST_SEEN).template data(); + auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data(); + auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); + auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); + int64_t* lastSeenOut = Output(OUTPUT_LAST_SEEN)->template mutable_data(); + + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; + adam_compute_smart_decay( + block_size, + t, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + lastSeenIn + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + lastSeenOut + offsetIdx, + beta1_, + beta2_, + epsilon_, + lr, + &context_); + } + + return true; + } + + protected: + T beta1_; + T beta2_; + T epsilon_; + INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, LAST_SEEN, INDICES, GRAD, LR, ITER); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_LAST_SEEN); +}; + template class RowWiseSparseAdamOp final : public Operator { public: From 9b2b45919a44641489647f4588568bcc3638a04d Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Fri, 9 Jul 2021 18:38:31 -0700 Subject: [PATCH 053/122] Revert D29639797: [package] error if we try to mock a module in 3.6 Test Plan: revert-hammer Differential Revision: D29639797 Original commit changeset: 775ed78638fb fbshipit-source-id: 9d2f6dae7ee35c6b37338e36ec7ade9d9e2ccbc2 --- test/package/test_dependency_api.py | 9 --------- torch/package/package_exporter.py | 6 ------ 2 files changed, 15 deletions(-) diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index a42279c807df5..d9ff435de57d5 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -130,15 +130,6 @@ def test_mock(self): with self.assertRaisesRegex(NotImplementedError, "was mocked out"): r() - @skipIf(version_info > (3, 6), "tests specific 3.6 behavior") - def test_mock_36_error(self): - """Test that an error is properly thrown when we attempt to mock a - module in Python 3.6. - """ - with self.assertRaisesRegex(RuntimeError, "upgrade your Python"): - with PackageExporter(BytesIO()) as exporter: - exporter.mock(["package_a"]) - @skipIf(version_info < (3, 7), "mock uses __getattr__ a 3.7 feature") def test_mock_glob(self): buffer = BytesIO() diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 4f047fedce048..0a88b75245ef3 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from sys import version_info from typing import ( Any, BinaryIO, @@ -720,11 +719,6 @@ def mock( If ``allow_empty=True``, no such exception is thrown. """ - if version_info < (3, 7): - raise RuntimeError( - "Python 3.7 or higher is required to mock out modules. " - "Please upgrade your Python version." 
- ) self.patterns[GlobGroup(include, exclude=exclude)] = _PatternInfo( _ModuleProviderAction.MOCK, allow_empty ) From 06166a13e0a03e710d03280d411b553f0316fb3a Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 9 Jul 2021 19:21:17 -0700 Subject: [PATCH 054/122] Remove VS install step unless necessary from GHA Windows workflows (#60791) Summary: ~~This should only be merged after our AMI has been deployed after https://github.com/fairinternal/pytorch-gha-infra/pull/1. (And will likely fail our current windows jobs)~~ I have revised this PR to install VS only when it's not already installed. This should save ~5min per Windows workflow. ![image](https://user-images.githubusercontent.com/31798555/125141598-7e886c80-e0e3-11eb-9fe0-bb9e6bcc14f1.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/60791 Reviewed By: soulitzer Differential Revision: D29643876 Pulled By: janeyx99 fbshipit-source-id: 4bcfaf5bcad9e5636a1624c3e799e7cc97a87660 --- .circleci/scripts/vs_install.ps1 | 45 +++++++++++-------- .github/templates/windows_ci_workflow.yml.j2 | 1 + ...c-pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 1 + .../workflows/pytorch-win-vs2019-cpu-py3.yml | 1 + .../pytorch-win-vs2019-cuda10-cudnn7-py3.yml | 1 + .../pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 1 + 6 files changed, 32 insertions(+), 18 deletions(-) diff --git a/.circleci/scripts/vs_install.ps1 b/.circleci/scripts/vs_install.ps1 index 95b4aa7ead2fd..8b3886347531c 100644 --- a/.circleci/scripts/vs_install.ps1 +++ b/.circleci/scripts/vs_install.ps1 @@ -1,8 +1,8 @@ # https://developercommunity.visualstudio.com/t/install-specific-version-of-vs-component/1142479 -# https://docs.microsoft.com/en-us/visualstudio/releases/2019/history#release-dates-and-build-numbers +# Where to find the links: https://docs.microsoft.com/en-us/visualstudio/releases/2019/history#release-dates-and-build-numbers -# 16.8.6 BuildTools -$VS_DOWNLOAD_LINK = "https://s3.amazonaws.com/ossci-windows/vs16.8.6_BuildTools.exe" +# BuildTools from S3 +$VS_DOWNLOAD_LINK = "https://s3.amazonaws.com/ossci-windows/vs${env:VS_VERSION}_BuildTools.exe" $COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe" $VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", "--add Microsoft.Component.MSBuild", @@ -18,32 +18,41 @@ if (${env:INSTALL_WINDOWS_SDK} -eq "1") { $VS_INSTALL_ARGS += "--add Microsoft.VisualStudio.Component.Windows10SDK.19041" } +if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe") { + $VS_VERSION_major = [int] ${env:VS_VERSION}.split(".")[0] + $existingPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -version "[${env:VS_VERSION}, ${env:VS_VERSION_major + 1})" -property installationPath + if ($existingPath -ne $null) { + echo "Found correctly versioned existing BuildTools installation in $existingPath" + exit 0 + } + $pathToRemove = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -property installationPath +} + +echo "Downloading VS installer from S3." 
curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2019 Version 16.8.6 installer failed" + echo "Download of the VS 2019 Version ${env:VS_VERSION} installer failed" exit 1 } -if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe") { - $existingPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -version "[16, 17)" -property installationPath - if ($existingPath -ne $null) { - echo "Found existing BuildTools installation in $existingPath" - $VS_UNINSTALL_ARGS = @("uninstall", "--installPath", "`"$existingPath`"", "--quiet","--wait") - $process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_UNINSTALL_ARGS -NoNewWindow -Wait -PassThru - $exitCode = $process.ExitCode - if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "Original BuildTools uninstall failed with code $exitCode" - exit 1 - } - echo "Original BuildTools uninstalled" +if ($pathToRemove -ne $null) { + echo "Uninstalling $pathToRemove." + $VS_UNINSTALL_ARGS = @("uninstall", "--installPath", "`"$pathToRemove`"", "--quiet","--wait") + $process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_UNINSTALL_ARGS -NoNewWindow -Wait -PassThru + $exitCode = $process.ExitCode + if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "Original BuildTools uninstall failed with code $exitCode" + exit 1 } + echo "Other versioned BuildTools uninstalled." } +echo "Installing Visual Studio version ${env:VS_VERSION}." $process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru Remove-Item -Path vs_installer.exe -Force $exitCode = $process.ExitCode if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." + echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe if ($LASTEXITCODE -ne 0) { echo "Download of the VS Collect tool failed." 
@@ -51,6 +60,6 @@ if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { } Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru New-Item -Path "C:\w\build-results" -ItemType "directory" -Force - Copy-Item -Path "C:\Users\circleci\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\" + Copy-Item -Path "${env:TEMP}\vslogs.zip" -Destination "C:\w\build-results\" exit 1 } diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 8414b0f7720bb..bfd28439f9b86 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -27,6 +27,7 @@ env: SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" + VS_VERSION: "16.8.6" VC_YEAR: "2019" {%- if cuda_version != "cpu" %} TORCH_CUDA_ARCH_LIST: "7.0" diff --git a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 9f498f1923070..223a8894268d6 100644 --- a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -18,6 +18,7 @@ env: SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" + VS_VERSION: "16.8.6" VC_YEAR: "2019" TORCH_CUDA_ARCH_LIST: "7.0" USE_CUDA: 1 diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml index bf91d5fb78bd3..df17edc1cd753 100644 --- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml @@ -21,6 +21,7 @@ env: SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" + VS_VERSION: "16.8.6" VC_YEAR: "2019" concurrency: diff --git a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml index 44ddd3248b9ca..472e05bbe06ad 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml @@ -21,6 +21,7 @@ env: SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" + VS_VERSION: "16.8.6" VC_YEAR: "2019" TORCH_CUDA_ARCH_LIST: "7.0" USE_CUDA: 1 diff --git a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 63519be3ec516..3c906a8d7d9d6 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -20,6 +20,7 @@ env: SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" + VS_VERSION: "16.8.6" VC_YEAR: "2019" TORCH_CUDA_ARCH_LIST: "7.0" USE_CUDA: 1 From a556c1c4dcc2df0dc72f6385ef5d38e911a5e0f9 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Fri, 9 Jul 2021 19:24:38 -0700 Subject: [PATCH 055/122] [profiler] Update Kineto submodule (ci-all) (#61478) Summary: Update Kineto submodule Pull Request resolved: https://github.com/pytorch/pytorch/pull/61478 Test Plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/61432 Reviewed By: gdankel Differential Revision: D29646019 Pulled By: ilia-cher fbshipit-source-id: 02ecb0a2a6b457f6537c7d6b3c475e1e0ace3b6f --- third_party/kineto | 2 +- torch/csrc/autograd/profiler_kineto.cpp | 27 ++----------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/third_party/kineto b/third_party/kineto index 540289cfc9079..dbfa0ead96612 160000 --- a/third_party/kineto 
+++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit 540289cfc9079e774d4e72b6888589b3132c799b +Subproject commit dbfa0ead96612f7ca265c63a35fdf0488395179b diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index cd4980063fa6c..d2cdac5a8596c 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -58,7 +58,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { return; } -#ifdef USE_KINETO_UPDATED libkineto::GenericTraceActivity op( cpu_trace->span, libkineto::ActivityType::CPU_OP, @@ -66,14 +65,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { op.device = libkineto::processId(); op.resource = libkineto::systemThreadId(); op.id = ctx->correlationId; -#else - libkineto::GenericTraceActivity op; - op.activityType = libkineto::ActivityType::CPU_OP; - op.activityName = std::string(fn.name().str()); - op.device = libkineto::processId(); - op.sysThreadId = libkineto::systemThreadId(); - op.correlation = ctx->correlationId; -#endif op.startTime = ctx->startUs; op.endTime = getTimeUs(); // optimization - postpone shapesToStr till finalizeCPUTrace @@ -132,7 +123,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { { std::lock_guard guard(state_mutex_); libkineto::api().activityProfiler().recordThreadInfo(); -#ifdef USE_KINETO_UPDATED + memory_events_.emplace_back( cpu_trace->span, libkineto::ActivityType::CPU_INSTANT_EVENT, @@ -140,13 +131,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { auto& act = memory_events_.back(); act.device = libkineto::processId(); act.resource = libkineto::systemThreadId(); -#else - memory_events_.emplace_back(); - auto& act = memory_events_.back(); - act.activityType = libkineto::ActivityType::CPU_INSTANT_EVENT; - act.activityName = "[memory]"; - act.sysThreadId = libkineto::systemThreadId(); -#endif + act.startTime = getTimeUs(); act.addMetadata("Device Type", std::to_string((int8_t)device.type())); act.addMetadata("Device Id", std::to_string(device.index())); @@ -161,9 +146,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { // These events are already processed if (ev_ptr->type() != libkineto::ActivityType::CPU_OP && ev_ptr->type() != libkineto::ActivityType::CPU_INSTANT_EVENT -#ifdef USE_KINETO_UPDATED && ev_ptr->type() != libkineto::ActivityType::USER_ANNOTATION -#endif ) { kineto_events_.emplace_back(); kineto_events_.back() @@ -396,9 +379,7 @@ void prepareProfiler( std::set cpuTypes = { libkineto::ActivityType::CPU_OP, libkineto::ActivityType::CPU_INSTANT_EVENT, -#ifdef USE_KINETO_UPDATED libkineto::ActivityType::USER_ANNOTATION, -#endif libkineto::ActivityType::EXTERNAL_CORRELATION, libkineto::ActivityType::CUDA_RUNTIME, }; @@ -531,14 +512,10 @@ c10::DeviceType KinetoEvent::deviceType() const { case (uint8_t)libkineto::ActivityType::GPU_MEMCPY: case (uint8_t)libkineto::ActivityType::GPU_MEMSET: case (uint8_t)libkineto::ActivityType::CONCURRENT_KERNEL: -#ifdef USE_KINETO_UPDATED case (uint8_t)libkineto::ActivityType::GPU_USER_ANNOTATION: -#endif return c10::DeviceType::CUDA; case (uint8_t)libkineto::ActivityType::CPU_OP: -#ifdef USE_KINETO_UPDATED case (uint8_t)libkineto::ActivityType::USER_ANNOTATION: -#endif case (uint8_t)libkineto::ActivityType::EXTERNAL_CORRELATION: case (uint8_t)libkineto::ActivityType::CUDA_RUNTIME: case (uint8_t)libkineto::ActivityType::CPU_INSTANT_EVENT: From 677313b6705cd3a1361294675beb5dab61bf1ddc Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Fri, 9 
Jul 2021 19:24:43 -0700 Subject: [PATCH 056/122] ReLU (#61150) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61150 Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D29625826 Pulled By: migeed-z fbshipit-source-id: 10e0662e33ccd4342cedd51579a10651755b633f --- test/fx/test_gradual_type.py | 55 +++++++++++++++++++ .../experimental/graph_gradual_typechecker.py | 18 +++++- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index 813fab259632d..9946303548b2e 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -408,6 +408,7 @@ def forward(self, x: TensorType((5, 2, 3, 4))): with self.assertRaises(TypeError): tc.type_check() + def test_type_check_conv2D_2_fully_static(self): annotation_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14), (10, Dyn, 13, 14), (Dyn, Dyn, Dyn, 3)] @@ -496,5 +497,59 @@ def forward(self, x): assert is_consistent(n.type, TensorType(b.size())) + def test_typecheck_basicblock(self): + class BasicBlock(torch.nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = torch.nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: TensorType((2, 2, 4, 5))): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + B = BasicBlock(2, 2) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in traced.graph.nodes: + if n.target == 'output': + assert isinstance(n.type, TensorType) + assert torch.Size(n.type.__args__) == B.forward(torch.rand(2, 2, 4, 5)).size() + if __name__ == '__main__': unittest.main() diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index 073514aed0882..d505230198501 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -202,8 +202,7 @@ def bn2d_inference_rule(n: Node, module_instance): # to be the node type # so if an incoming argument has more type information # we set this node's type to be the argument type - if is_more_precise(arg_type, n.type): - n.type = arg_type + n.type = get_greatest_upper_bound(arg_type, n.type) return n.type else: raise TypeError(f'Cannot apply {module_instance} with input type {arg_type} and existing type {n.type} on {n}') @@ -267,6 +266,21 @@ def conv2d_inference_rule(n: Node, module_instance): else: raise TypeError(f'Cannot apply {module_instance} with input type { arg_type} and existing type {n.type} on {n}') + 
+@register_inference_rule(torch.nn.ReLU) +def relu_inference_rule(n: Node, module_instance): + """ + Input and output shapes should be equal. + """ + assert isinstance(n.args[0], Node) + + if n.args[0].type == Dyn and isinstance(n.type, TensorType): + n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__)) + + if isinstance(n.args[0].type, TensorType): + n.type = get_greatest_upper_bound(n.args[0].type, n.type) + return n.type + class GraphTypeChecker: def __init__(self, env, traced): self.env = env From 73b86c9f9c920b66c7c05f4734a75cc9346ab3fa Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 9 Jul 2021 21:41:26 -0700 Subject: [PATCH 057/122] Add getMethod to PytorchPredictorContainer (#61052) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61052 Implement getMethod in the container in a similar way to getPredictor, using either Deploy or Script functionality depending on how the container was initialized and how the gflag deploy override are set. Test Plan: Add new unit test Reviewed By: houseroad Differential Revision: D29346969 fbshipit-source-id: 08e95ee96d533f5a7cc9c8f9b1c53751715c9181 --- torch/csrc/deploy/deploy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h index 108e1080f818f..3a7cfd1060715 100644 --- a/torch/csrc/deploy/deploy.h +++ b/torch/csrc/deploy/deploy.h @@ -239,8 +239,9 @@ class PythonMethodWrapper : public torch::IMethod { // ReplicatedObj which represents a python method, and // is therefore callable and has argument names accessible. public: + // TODO(whc) make bound method pickleable, then directly construct from that PythonMethodWrapper( - torch::deploy::ReplicatedObj& model, + torch::deploy::ReplicatedObj model, std::string method_name) : model_(std::move(model)), method_name_(std::move(method_name)) {} From 336970c03e6b19ecb42d856f0463d769c7a83f3d Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Sat, 10 Jul 2021 03:49:40 -0700 Subject: [PATCH 058/122] Add note on torch.distributed backends on ROCm (#58975) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/58975 Reviewed By: soulitzer Differential Revision: D29595510 Pulled By: rohan-varma fbshipit-source-id: 384bb67fcd003d65b76e957a474406b2a38099b9 --- docs/source/notes/hip.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst index 8906f6402935b..20f99cb96c5b0 100644 --- a/docs/source/notes/hip.rst +++ b/docs/source/notes/hip.rst @@ -112,6 +112,13 @@ hipFFT/rocFFT plan cache Setting the size of the cache for hipFFT/rocFFT plans is not supported. +.. _torch-distributed-backends: + +torch.distributed backends +-------------------------- + +Currently, only the "nccl" and "gloo" backends for torch.distributed are supported on ROCm. + Refer to CUDA Semantics doc --------------------------- From 8bcf24b37af41dc0f25fcde4e98bab55493f1f45 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Sat, 10 Jul 2021 03:52:05 -0700 Subject: [PATCH 059/122] [TCPStore] enhance connect timeout error message (#61390) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61390 Enhances this error message for better debugability. 
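For illustration, a minimal client-side sketch of how this message can be hit (the host, port, and timeout values below are placeholders, not taken from this diff): when nothing is listening at the given address, the connect attempt eventually times out and the error raised now also reports the original timeout value.

    # Sketch only: placeholder address; assumes a PyTorch build with distributed support.
    import datetime
    import torch.distributed as dist

    # With no server listening on this port, the client keeps retrying until the
    # timeout elapses and then surfaces the enhanced connect-timeout message.
    store = dist.TCPStore("127.0.0.1", 29500, world_size=2, is_master=False,
                          timeout=datetime.timedelta(seconds=30))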
ghstack-source-id: 133185482 Test Plan: CI Reviewed By: H-Huang Differential Revision: D29601528 fbshipit-source-id: f7aaf4d67ac96e6ed0b535e0200f918dd01e42f9 --- torch/csrc/distributed/c10d/UnixSockUtils.hpp | 10 +++++++++- torch/csrc/distributed/c10d/Utils.cpp | 10 +++++++--- torch/csrc/distributed/c10d/WinSockUtils.hpp | 8 +++++++- torch/distributed/elastic/utils/distributed.py | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/torch/csrc/distributed/c10d/UnixSockUtils.hpp b/torch/csrc/distributed/c10d/UnixSockUtils.hpp index b75bddb763787..07862806a0035 100644 --- a/torch/csrc/distributed/c10d/UnixSockUtils.hpp +++ b/torch/csrc/distributed/c10d/UnixSockUtils.hpp @@ -56,7 +56,15 @@ inline void waitSocketConnected( throw std::system_error(errno, std::system_category()); } else if (numReady == 0) { errno = 0; - TORCH_CHECK(false, kConnectTimeoutMsg); + TORCH_CHECK( + false, + c10::str( + kConnectTimeoutMsg, + " Polled for ", + pollTimeout, + " ms with original timeout of ", + timeout.count(), + " ms.")); } socklen_t errLen = sizeof(errno); diff --git a/torch/csrc/distributed/c10d/Utils.cpp b/torch/csrc/distributed/c10d/Utils.cpp index 5d9aa744dbacd..9d75dc4934e9e 100644 --- a/torch/csrc/distributed/c10d/Utils.cpp +++ b/torch/csrc/distributed/c10d/Utils.cpp @@ -228,9 +228,13 @@ void handleConnectException( // timeout. A timeout is specified if timeout != kNoTimeout. if (timeout != kNoTimeout) { const auto elapsed = std::chrono::high_resolution_clock::now() - start; - if (elapsed > timeout) { - TORCH_CHECK(false, kConnectTimeoutMsg); - } + TORCH_CHECK( + elapsed <= timeout, + c10::str( + kConnectTimeoutMsg, + " Original timeout was ", + timeout.count(), + " ms.")); } std::this_thread::sleep_for(std::chrono::seconds(1)); *anyRefused = false; diff --git a/torch/csrc/distributed/c10d/WinSockUtils.hpp b/torch/csrc/distributed/c10d/WinSockUtils.hpp index 793a0dc7640f2..268809fdc3806 100644 --- a/torch/csrc/distributed/c10d/WinSockUtils.hpp +++ b/torch/csrc/distributed/c10d/WinSockUtils.hpp @@ -46,7 +46,13 @@ inline void waitSocketConnected( std::chrono::high_resolution_clock::now() - startTime; if (elapsed > timeout) { errno = 0; - TORCH_CHECK(false, kConnectTimeoutMsg); + TORCH_CHECK( + false, + c10::str( + kConnectTimeoutMsg, + " Original timeout was ", + timeout.count(), + " ms.")); } } std::this_thread::sleep_for(std::chrono::milliseconds(10)); diff --git a/torch/distributed/elastic/utils/distributed.py b/torch/distributed/elastic/utils/distributed.py index 04393f0365027..0cfa7c07c06ae 100644 --- a/torch/distributed/elastic/utils/distributed.py +++ b/torch/distributed/elastic/utils/distributed.py @@ -71,7 +71,7 @@ def create_c10d_store( # detects timeouts and port conflicts in their own unittests # see - caffe2/torch/testing/_internal/common_utils.py # TODO properly map the exceptions in pybind (c10d/init.cpp) - if str(e) == _CONNECT_TIMEOUT and not is_server: + if _CONNECT_TIMEOUT in str(e) and not is_server: raise TimeoutError( f"timed out waiting for tcp store's server: {server_addr}:{port}" ) from e From f291b1899f55088601013e2bc250bb379b3a7759 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 10 Jul 2021 13:08:25 -0700 Subject: [PATCH 060/122] Revert D27978269: Smart Decay for Adam - Caffe2 Test Plan: revert-hammer Differential Revision: D27978269 (https://github.com/pytorch/pytorch/commit/aaa1e07609b847cb8ec8d93cea1cb0ac51d32f4c) Original commit changeset: e47524101ddf fbshipit-source-id: 334824bbf9a6ed788e75af9c292754081f70a19b --- 
caffe2/python/operator_test/adam_test.py | 110 +-------------------- caffe2/sgd/adam_op.cc | 39 -------- caffe2/sgd/adam_op.h | 121 ----------------------- 3 files changed, 4 insertions(+), 266 deletions(-) diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 5de272b93db29..60d39d831a93e 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -32,37 +32,6 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, else: return param_out, mom1_out, mom2_out - @staticmethod - def ref_smart_decay_adam(param, mom1, mom2, last_seen, grad, LR, ITER, - beta1, beta2, epsilon): - t = ITER + 1 - - k = int(np.array(t - last_seen).flatten()[0]) - last_seen_out = t - - if beta1 == 0.0: - mom1_out = grad - mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) - grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) - param_out = param + LR * grad_out - return param_out, mom1_out, mom2_out, last_seen_out - - # Make up for lost minibatches. - else: - mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) - p_out = param - m = mom1 - # For catchup - for i in range(k - 1): - m *= beta1 - update = m / (np.sqrt(mom2_out) + epsilon) - p_out += LR * update - # For the single step update - mom1_out = m * beta1 + grad * (1 - beta1) - grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) - param_out = p_out + LR * grad_out - return param_out, mom1_out, mom2_out, last_seen_out - @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon, output_grad=False): @@ -212,77 +181,6 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_smart_decay_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): - param, mom1, mom2, grad = inputs - - mom2 = np.absolute(mom2) - ITER = np.array([ITER], dtype=np.int64) - # Here we will define the last_seen tensor as being randomly from 0 to ITER - # (the value of t to be tested will be ITER+1) - last_seen = np.random.randint(low=0, high=ITER + 1, size=param.shape, dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Verify that the generated indices are unique - hypothesis.assume( - np.array_equal( - np.unique(indices.flatten()), - np.sort(indices.flatten()))) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "SmartDecaySparseAdam", - ["param", "mom1", "mom2", "last_seen", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2", "last_seen"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - last_seen_out = np.copy(last_seen) - - for i, 
index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index], last_seen_out[index] = \ - self.ref_smart_decay_adam(param[index], mom1[index], mom2[index], last_seen[index], - grad[i], LR, ITER, - beta1, beta2, epsilon) - return (param_out, mom1_out, mom2_out, last_seen_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, last_seen, indices, grad, LR, ITER], - ref_sparse, - input_device_options=input_device_options) - @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -296,7 +194,7 @@ def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER): data_strategy=st.data(), **hu.gcs) def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): + data_strategy, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.absolute(mom2) ITER = np.array([ITER], dtype=np.int64) @@ -329,7 +227,7 @@ def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): + beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) @@ -448,7 +346,7 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): data_strategy=st.data(), **hu.gcs) def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, - epsilon, data_strategy, gc, dc): + epsilon, data_strategy, gc, dc): param, mom1, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) @@ -492,7 +390,7 @@ def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): + beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index 7e14e3d07df6b..dd74a937c9c19 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -91,45 +91,6 @@ OPERATOR_SCHEMA(SparseAdam) .Arg("epsilon", "Default 1e-5") .Arg("enableRAdam", "Default false"); -REGISTER_CPU_OPERATOR(SmartDecaySparseAdam, SmartDecaySparseAdamOp); -OPERATOR_SCHEMA(SmartDecaySparseAdam) - .NumInputs(8) - .NumOutputs(4) - .EnforceInplace({{0, 0}, {1, 1}, {2, 2}, {3, 3}}) - .DeviceInferenceFunction([](const OperatorDef& def) { - auto op_device = - def.has_device_option() ? def.device_option() : DeviceOption(); - vector in_dev(def.input_size(), op_device); - vector out_dev(def.output_size(), op_device); - // ITER input lives on CPU - in_dev[7] = DeviceOption(); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( - - Computes the Adam Update for the sparse case. - Given inputs (param, moment1, moment2, indices, grad, lr, iter), runs the dense - Adam on (param, moment1[indices], momemnt2[indices], lr, iter) and returns - (new_param, new_moment1, new_moment2) as in dense case. - Adam can be customized as Rectified Adam (RAdam) by setting enableRAdam = true. 
- - )DOC") - .Input(0, "param", "Parameters to be updated") - .Input(1, "moment_1", "First moment history") - .Input(2, "moment_2", "Second moment history") - .Input(3, "last_seen", "Minibatch index when each weight was last seen") - .Input(4, "indices", "Sparse indices") - .Input(5, "grad", "Gradient computed") - .Input(6, "lr", "learning rate") - .Input(7, "iter", "iteration number") - .Output(0, "output_param", "Updated parameters") - .Output(1, "output_moment_1", "Updated first moment") - .Output(2, "output_moment_2", "Updated second moment") - .Output(3, "output_last_seen", "Updated minibatch index when each weight was last seen") - .Arg("beta1", "Default 0.9") - .Arg("beta2", "Default 0.999") - .Arg("epsilon", "Default 1e-5"); - // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) REGISTER_CPU_OPERATOR( RowWiseSparseAdam, diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index 4e1b6e345f359..c7a5db65db252 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -1,7 +1,6 @@ #pragma once #include "caffe2/core/operator.h" -#include "caffe2/core/tensor.h" namespace caffe2 { @@ -53,42 +52,6 @@ void adam_compute( } } -template -void adam_compute_smart_decay( - int N, - long int t, - const float* w, - const float* g, - const float* m, - const float* v, - const int64_t* lastSeenIn, - float* nw, - float* nm, - float* nv, - int64_t* lastSeenOut, - float beta1, - float beta2, - float eps_hat, - //float correction, - const float* lr, - Context* /*context*/) { - for (auto i = 0; i < N; ++i) { - float gi = g[i]; - // The number of steps since this param was last seen. - long int k = t - lastSeenIn[i]; - lastSeenOut[i] = t; - // Same as sparse Adam except v is decayed by beta2^k rather than beta2 - // Catchup = \sum_{i=1}^{k-1}\beta_1^i = \beta_1 \left(\frac{1-\beta_1^k}{1-\beta_1}\right) - float catchup = 0.0; - if (k > 1) { - catchup = m[i] * beta1 * (1 - std::pow(beta1, k)) / (1 - beta1); - } - float mi = nm[i] = m[i] * std::pow(beta1, k) + gi * (1 - beta1); - float vi = nv[i] = v[i] * std::pow(beta2, k) + gi * gi * (1 - beta2); - nw[i] = w[i] + (lr[0] * (mi + catchup)) / (std::sqrt(vi) + eps_hat); - } -} - template void adam_compute_output_grad( int N, @@ -546,90 +509,6 @@ class SparseAdamOp final : public Operator { OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; -template -class SmartDecaySparseAdamOp final : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - SmartDecaySparseAdamOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta1_(this->template GetSingleArgument("beta1", 0.9f)), - beta2_(this->template GetSingleArgument("beta2", 0.999f)), - epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} - - bool RunOnDevice() override { - // Enforce shapes - CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(MOMENT_1).numel()); - CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(MOMENT_2).numel()); - CAFFE_ENFORCE_EQ(Input(PARAM).numel(), Input(LAST_SEEN).numel()); - CAFFE_ENFORCE_EQ( - Input(PARAM).size_from_dim(1), - Input(GRAD).size_from_dim(Input(INDICES).dim())); - CAFFE_ENFORCE_EQ(Input(LR).numel(), 1); - - return DispatchHelper>::call( - this, Input(INDICES)); - } - - template - bool DoRunWithType() { - const auto* lr = Input(LR).template data(); - const auto iter = - OperatorBase::Input(ITER, CPU).template data()[0]; - - const int64_t t = iter + 1; - //const auto beta1_correction = T(1.) / (T(1.) - std::pow(beta1_, t)); - //const auto beta2_correction = - // T(1.) / std::sqrt(T(1.) 
- std::pow(beta2_, t)); - //const auto correction = beta1_correction / beta2_correction; - - auto block_size = Input(PARAM).numel() / Input(PARAM).size(0); - auto n = Input(GRAD).numel() / block_size; - - const auto* paramIn = Input(PARAM).template data(); - const auto* indices = Input(INDICES).template data(); - const auto* gradIn = Input(GRAD).template data(); - const auto* moment1In = Input(MOMENT_1).template data(); - const auto* moment2In = Input(MOMENT_2).template data(); - const int64_t* lastSeenIn = Input(LAST_SEEN).template data(); - auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data(); - auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); - auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - int64_t* lastSeenOut = Output(OUTPUT_LAST_SEEN)->template mutable_data(); - - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; - adam_compute_smart_decay( - block_size, - t, - paramIn + offsetIdx, - gradIn + offsetI, - moment1In + offsetIdx, - moment2In + offsetIdx, - lastSeenIn + offsetIdx, - paramOut + offsetIdx, - moment1Out + offsetIdx, - moment2Out + offsetIdx, - lastSeenOut + offsetIdx, - beta1_, - beta2_, - epsilon_, - lr, - &context_); - } - - return true; - } - - protected: - T beta1_; - T beta2_; - T epsilon_; - INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, LAST_SEEN, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_LAST_SEEN); -}; - template class RowWiseSparseAdamOp final : public Operator { public: From ccd097706005b0636699b0b6bbd016c0d7cd72c5 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Sat, 10 Jul 2021 14:04:48 -0700 Subject: [PATCH 061/122] [Static Runtime] Support prim::GetAttr/SetAttr (#61505) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61505 The handling of `self` in static runtime was previously incorrect. This diff fixed that issue, since self is essential to prim::GetAttr/SetAttr. After all, most of the time we're getting and setting attributes from self, the torch script module. 
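For context, a minimal sketch of the attribute pattern this makes work under Static Runtime (the module and values below are illustrative; the new test_attr unit test exercises the same shape through the StaticModule wrapper):

    import torch
    from torch import nn

    class Scale(nn.Module):
        def __init__(self):
            super().__init__()
            self.factor = 2          # plain module attribute, not a constant

        def forward(self, x):
            self.factor = 3          # becomes prim::SetAttr in the scripted graph
            return x * self.factor   # becomes prim::GetAttr in the scripted graph

    scripted = torch.jit.script(Scale())
    print(scripted(torch.ones(2)))   # tensor([3., 3.])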
Reviewed By: ajyu Differential Revision: D29350173 fbshipit-source-id: 6e62add4cda517ef8cd6c315d4cb0595e7d531fb --- test/test_static_runtime.py | 94 ++++++++++- torch/csrc/jit/runtime/static/impl.cpp | 193 ++++++++++++++--------- torch/csrc/jit/runtime/static/impl.h | 31 +++- torch/csrc/jit/runtime/static/ops.cpp | 40 +++-- torch/csrc/jit/runtime/static/ops.h | 1 - torch/csrc/jit/runtime/static/passes.cpp | 25 +-- torch/csrc/jit/runtime/static/passes.h | 6 +- 7 files changed, 262 insertions(+), 128 deletions(-) diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 89cf184f76a85..9b38a5a7e36a8 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -1,11 +1,11 @@ -import numpy as np -import torch import unittest +from typing import Dict, Optional +import numpy as np +import torch from torch import nn from torch.testing._internal.common_utils import TestCase, run_tests -from typing import Dict, Optional class StaticModule: def __init__(self, scripted): @@ -30,7 +30,9 @@ def benchmark_individual_ops(self, args, kwargs, warmup_runs, main_runs): ) -def linear_shim(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: +def linear_shim( + input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None +) -> torch.Tensor: output = input.matmul(weight.t()) if bias is not None: output += bias @@ -107,7 +109,8 @@ def trivial_graph(a, b, c): s = torch.tensor([[3, 3], [3, 3]]) return a + b * c + s -def loop_graph(a, b, iters : int): + +def loop_graph(a, b, iters: int): c = a + b * 2 for i in range(iters): c = c + b @@ -115,14 +118,50 @@ def loop_graph(a, b, iters : int): c -= a return c -def output_graph(a, b, c, iters : int): + +def output_graph(a, b, c, iters: int): s = torch.tensor([[3, 3], [3, 3]]) k = a + b * c + s - d : Dict[int, torch.Tensor] = {} + d: Dict[int, torch.Tensor] = {} for i in range(iters): d[i] = k + i return d + +class SubModule(nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.a = 11 + self.b = 2 + + def forward(self, x): + return self.a + self.b + x + + +class SubModule2(nn.Module): + def __init__(self): + super(SubModule2, self).__init__() + self.a = 12 + self.b = 2 + + def forward(self, x): + self.b = 30 + return self.a + self.b + x + + +class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub1 = SubModule() + self.sub2 = SubModule2() + self.a = 3 + self.b = 4 + + def forward(self, x): + self.b = 20 + return self.sub1(x) + self.a + self.b + self.sub2(x) + + class TestStaticModule(TestCase): def test_multihead_attention_layer(self): HID_DIM = 256 @@ -220,6 +259,46 @@ def test_leaky_relu(self): o_test = tg_a(s)[0] torch.testing.assert_allclose(o_ref, o_test) + def test_attr(self): + """ + TorchScript IR of TestModule() after freezing: + graph(%self : __torch__.test_static_runtime.___torch_mangle_0.TestModule, + %x.1 : Tensor): + %18 : int = prim::Constant[value=30]() + %30 : int = prim::Constant[value=13]() + %3 : int = prim::Constant[value=20]() + %2 : int = prim::Constant[value=1]() + %self.sub2.a : int = prim::Constant[value=12]() + %self.a : int = prim::Constant[value=3]() + = prim::SetAttr[name="b"](%self, %3) + %17 : Tensor = aten::add(%x.1, %30, %2) + %7 : Tensor = aten::add(%17, %self.a, %2) + %b.1 : int = prim::GetAttr[name="b"](%self) + %9 : Tensor = aten::add(%7, %b.1, %2) + %sub2 : __torch__.test_static_runtime.___torch_mangle_2.SubModule2 = prim::GetAttr[name="sub2"](%self) + = 
prim::SetAttr[name="b"](%sub2, %18) + %b : int = prim::GetAttr[name="b"](%sub2) + %22 : int = aten::add(%self.sub2.a, %b) + %23 : Tensor = aten::add(%x.1, %22, %2) + %12 : Tensor = aten::add(%9, %23, %2) + return (%12) + """ + # test prim::SetAttr and prim::GetAttr impl in Static Runtime + m = TestModule() + + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + + ms = torch.jit.script(m) + sm = StaticModule(ms) + output_sm = sm(input)[0] + torch.testing.assert_allclose(output_s, output_sm) + sm.benchmark([input], {}, 2, 2) + sm.benchmark_individual_ops([input], {}, 2, 2) + sm.benchmark([], {"x": input}, 2, 2) + sm.benchmark_individual_ops([], {"x": input}, 2, 2) + @unittest.skip("Temporarily disabled") def test_fusion_trivial_graph(self): s = torch.full((2, 2), 2) @@ -281,6 +360,5 @@ def test_fusion_outputs(self): torch.testing.assert_allclose(o_ref[i], o_test[i]) - if __name__ == "__main__": run_tests() diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index fd91523382638..7924e1b957a2c 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -45,20 +45,29 @@ void OptimizeGraph( ConstantPropagation(graph); } -void CheckGraphEligibility(const std::shared_ptr& graph) { - for (auto n : graph->nodes()) { - if (n->kind() == c10::Symbol::fromQualString("prim::GetAttr")) { - throw std::runtime_error("Cannot accelerate unfrozen graphs"); +bool CheckGraphEligibility(const std::shared_ptr& graph) { + // check for sub-blocks + bool can_support = true; + for (auto* node : graph->block()->nodes()) { + for (Block* sub_block : node->blocks()) { + VLOG(1) << "Found nested sub-blocks in graph at node: " + << PrintNode(node); + can_support = false; } } + + return can_support; } // remove unused input 0 from graph -void RemoveSelfFromGraphInput(std::shared_ptr& graph) { +bool RemoveSelfFromGraphInput(std::shared_ptr& graph) { if (graph->inputs().at(0)->type()->is_module()) { - TORCH_CHECK(!graph->inputs().at(0)->hasUses()); + if (graph->inputs().at(0)->hasUses()) { + return false; + } graph->eraseInput(0); } + return true; } // remove "self" from function schema @@ -443,12 +452,12 @@ GenerateSameStorageValues( void PrepareGraphForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts) { - CheckGraphEligibility(graph); + // TODO: call CheckGraphEligibility before trying to enable static runtime + TORCH_CHECK(CheckGraphEligibility(graph)); OptimizeGraph(graph, opts); - RemoveSelfFromGraphInput(graph); } -std::pair, c10::optional> +std::pair, std::shared_ptr> PrepareForStaticModule( const torch::jit::Module& m, const StaticModuleOptions& opts) { @@ -461,22 +470,23 @@ PrepareForStaticModule( auto module = m.copy(); module.eval(); - module = freeze_module(module); + auto module_ptr = std::make_shared(freeze_module(module)); + + Method method = module_ptr->get_method("forward"); + auto graph = module_ptr->get_method("forward").graph(); - Method method = module.get_method("forward"); - auto graph = module.get_method("forward").graph(); + // graph->dump(); PrepareGraphForStaticModule(graph, opts); - c10::FunctionSchema s = RemoveSelfFromSchema(method.function().getSchema()); - return std::make_pair(graph, s); + return std::make_pair(graph, module_ptr); } -std::pair, c10::optional> +std::pair, std::shared_ptr> PrepareForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts) { PrepareGraphForStaticModule(graph, opts); - return std::make_pair(graph, c10::nullopt); + return 
std::make_pair(graph, nullptr); } } // namespace @@ -492,13 +502,12 @@ StaticModule::StaticModule( : StaticModule(PrepareForStaticModule(m, opts), opts) {} StaticModule::StaticModule( - std::pair< - std::shared_ptr, - c10::optional> graph_and_schema, + std::pair, std::shared_ptr> + graph_and_module, const StaticModuleOptions& opts) : opts_(opts), - graph_(std::move(graph_and_schema.first)), - schema_(std::move(graph_and_schema.second)) { + graph_(std::move(graph_and_module.first)), + module_(std::move(graph_and_module.second)) { // check opt flags if (opts.optimize_graph_output_memory) { TORCH_CHECK( @@ -511,6 +520,18 @@ StaticModule::StaticModule( "When optimize_memory is true, enable_out_variant must be set to true"); } + // handle schema + if (module_) { + Method method = module_->get_method("forward"); + schema_ = method.function().getSchema(); + if (RemoveSelfFromGraphInput(graph_)) { + schema_ = RemoveSelfFromSchema(method.function().getSchema()); + } else { + first_input_is_self_ = true; + schema_ = method.function().getSchema(); + } + } + // map Value* to IValue (from inputs or prim::Constant) or null std::unordered_map value_to_ivalue; // map Value* to its SSA definition IR @@ -620,6 +641,7 @@ StaticRuntime::StaticRuntime(const StaticModule& sm) : static_module_(sm) { // NB: create unchanging std::vectors we can reference inputs_.resize(sm.num_inputs()); nodes_.resize(sm.nodes().size()); + for (const auto idx : c10::irange(sm.nodes().size())) { const auto& n_ref = sm.nodes()[idx]; nodes_[idx] = n_ref; // copy the node @@ -688,6 +710,43 @@ std::vector StaticRuntime::operator()( return out; } +void StaticRuntime::set_inputs( + const std::vector& args, + const std::unordered_map& kwargs) { + if (!kwargs.empty()) { + // This is not ideal + TORCH_CHECK( + static_module_.schema(), + "Schema is not available. Consider creating the Static Runtime " + "with StaticModule(const torch::jit::Module& m) instead."); + std::vector stack; + stack.reserve(inputs_.size()); + if (static_module_.first_input_is_self()) { + stack.emplace_back(static_module_.module()._ivalue()); + } + stack.insert(stack.end(), args.begin(), args.end()); + + static_module_.schema()->checkAndNormalizeInputs(stack, kwargs); + DCHECK_EQ(inputs_.size(), stack.size()); + for (const auto i : c10::irange(stack.size())) { + Input(i) = std::move(stack[i]); + } + } else { + if (static_module_.first_input_is_self()) { + Input(0) = static_module_.module()._ivalue(); + DCHECK_EQ(inputs_.size(), args.size() + 1); + for (const auto i : c10::irange(args.size())) { + Input(i + 1) = args[i]; + } + } else { + DCHECK_EQ(inputs_.size(), args.size()); + for (const auto i : c10::irange(args.size())) { + Input(i) = args[i]; + } + } + } +} + c10::IValue StaticRuntime::operator()( const std::vector& args, const std::unordered_map& kwargs) { @@ -701,27 +760,13 @@ c10::IValue StaticRuntime::operator()( planner_->allocate(); } - if (!kwargs.empty()) { - // This is not ideal - TORCH_CHECK( - static_module_.schema(), - "Schema is not available. 
Consider creating the Static Runtime " - "with StaticModule(const torch::jit::Module& m) instead."); - std::vector s = args; - static_module_.schema()->checkAndNormalizeInputs(s, kwargs); - for (const auto i : c10::irange(s.size())) { - Input(i) = std::move(s[i]); - } - } else { - for (const auto i : c10::irange(args.size())) { - Input(i) = args[i]; - } - } + set_inputs(args, kwargs); // NB: before optimizing the order of execution, ensure that the // memory optimization pass (LivenessMap) is // aware of the new order! for (auto& n : nodes_) { + // LOG(INFO) << "Running node: " << PrintNode(n.node()); n.run(); } @@ -739,9 +784,7 @@ c10::IValue StaticRuntime::operator()( } planner_->deallocate(); // clean up owning refs of input tensors - for (IValue& ival : inputs_) { - ival = IValue(); - } + clean_up_input_ivalues(); } // no need to keep references of outputs in static runtime anymore @@ -829,6 +872,10 @@ void StaticRuntime::benchmark( << "%)" << std::endl; } check_for_memory_leak(); + +#ifndef NDEBUG + display_nodes(args, kwargs); +#endif } float StaticRuntime::benchmark_model( @@ -906,16 +953,36 @@ void display_pnode_info(const ProcessedNode& pnode) { } } -void StaticRuntime::display_nodes(const std::vector& args) { +void StaticRuntime::display_nodes( + const std::vector& args, + const std::unordered_map& kwargs) { c10::InferenceMode mode; - std::vector stack(args); - for (size_t i = 0; i < stack.size(); i++) { - Input(i) = stack[i]; + if (planner_) { + planner_->allocate(); } + set_inputs(args, kwargs); + for (auto& node : nodes_) { node.run(); display_pnode_info(node); } + + if (static_module_.opts().cleanup_activations) { + // MemoryPlanner is created after the first invocation of `run()`. This is + // done intentionally because MemoryPlanner uses `Tensor` sizes of the + // previous `run()` for memory planning of subsequent runs + if (!planner_) { + planner_ = std::make_unique( + this, + static_module_.values_share_same_storage(), + static_module_.external_values(), + static_module_.opts().enable_out_variant, + static_module_.opts().optimize_graph_output_memory); + } + planner_->deallocate(); + // clean up owning refs of input tensors + clean_up_input_ivalues(); + } } StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( @@ -934,18 +1001,9 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( // setup time caffe2::Timer timer; - std::vector stack(args); - if (!kwargs.empty()) { - // This is not ideal - TORCH_CHECK( - static_module_.schema(), - "Schema is not available. 
Consider creating the Static Runtime " - "with StaticModule(const torch::jit::Module& m) instead."); - static_module_.schema()->checkAndNormalizeInputs(stack, kwargs); - } - for (const auto i : c10::irange(stack.size())) { - Input(i) = stack[i]; - } + + set_inputs(args, kwargs); + results.setup_time = timer.MilliSeconds(); // warmup runs @@ -957,9 +1015,9 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( // main runs for (const auto k : c10::irange(main_runs)) { (void)k; // Suppress unused variable warning - for (const auto i : c10::irange(stack.size())) { - Input(i) = stack[i]; - } + + set_inputs(args, kwargs); + timer.Start(); if (planner_) { planner_->allocate(); @@ -985,9 +1043,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( } planner_->deallocate(); // clean up owning refs of input tensors - for (IValue& ival : inputs_) { - ival = IValue(); - } + clean_up_input_ivalues(); } millis = timer.MilliSeconds(); results.memory_dealloc_time += millis; @@ -1283,16 +1339,11 @@ ProcessedNode::ProcessedNode( VLOG(1) << "Switch to out variant for node: " << PrintNode(node); return; } - if (!fn_ && mayRunNatively(node)) { - native_fn_ = getNativeOperation(node); - if (native_fn_) { - VLOG(1) << "Switch to native impl for node: " << PrintNode(node); - return; - } + if (!fn_ && (native_fn_ = getNativeOperation(node))) { + VLOG(1) << "Switch to native impl for node: " << PrintNode(node); + return; } - if (node->kind() != prim::ListConstruct && - node->kind() != prim::TupleConstruct && - node->kind() != prim::DictConstruct && node->kind() != prim::ListUnpack) { + { const Operator& op = node->getOperator(); TORCH_CHECK(op.hasOperation()); op_ = op.getOperation(node); diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 01cbcb7a83bf6..5d3527246868a 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -92,9 +92,8 @@ class TORCH_API StaticModule { private: explicit StaticModule( - std::pair< - std::shared_ptr, - c10::optional> graph_and_schema, + std::pair, std::shared_ptr> + graph_and_module, const StaticModuleOptions& opts); // for @@ -116,6 +115,10 @@ class TORCH_API StaticModule { return *graph_; } + const Module& module() const { + return *module_; + } + const StaticModuleOptions& opts() const; size_t num_inputs() const; size_t num_outputs() const; @@ -149,11 +152,17 @@ class TORCH_API StaticModule { return external_values_; } + bool first_input_is_self() const { + return first_input_is_self_; + } + StaticRuntime& runtime(); private: StaticModuleOptions opts_; + bool first_input_is_self_{false}; std::shared_ptr graph_; + std::shared_ptr module_; c10::optional schema_; std::unique_ptr cached_runtime_; @@ -188,7 +197,9 @@ class TORCH_API StaticRuntime { const std::vector& args, const std::unordered_map& kwargs); - void display_nodes(const std::vector& args); + void display_nodes( + const std::vector& args, + const std::unordered_map& kwargs); void benchmark( const std::vector& args, @@ -254,6 +265,18 @@ class TORCH_API StaticRuntime { void check_for_memory_leak(bool output_returned = true); private: + // helper method for copying input args/kwargs into inputs_ + void set_inputs( + const std::vector& args, + const std::unordered_map& kwargs); + + // clean up owning refs of input IValues + void clean_up_input_ivalues() { + for (IValue& ival : inputs_) { + ival = IValue(); + } + } + // Memory planning is only enabled if sm->opts().cleanup_activations is true. 
// Otherwise, the memory used by activations is cached inside the static // runtime. diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index aa045fd6cbf34..8cc6cae9ba06c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -189,28 +189,6 @@ std::function getOutOfPlaceOperation(Node* n) { return nullptr; } -// TODO: expand to include all view producing ops, mostly in -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp -bool mayRunNatively(Node* n) { - // In alphabetical order - const static std::unordered_set native_nodes{ - "aten::flatten", - "aten::reshape", - "aten::slice", - "aten::transpose", - "aten::to", - "prim::ListConstruct", - "prim::ListUnpack", - "prim::TupleConstruct", - "prim::DictConstruct", - "aten::__getitem__"}; - auto str = std::string(n->kind().toQualString()); - if (!native_nodes.count(str)) { - return false; - } - return true; -} - // Expensive check, use sparingly. // This is needed to make sure that we only switch to out variants for the // supported overloads, which is checked in the `Generate` step in @@ -1302,6 +1280,24 @@ std::function getNativeOperation(Node* n) { p_node->Output(0) = in0_t.clone(); } }; + } else if (n->kind() == prim::GetAttr) { + return [](ProcessedNode* p_node) { + auto module = p_node->Input(0).toObject(); + Node* node = p_node->node(); + const auto type = node->input()->type()->expect(); + const auto& field = node->s(attr::name); + const auto slot = type->getAttributeSlot(field); + p_node->Output(0) = module->getSlot(slot); + }; + } else if (n->kind() == prim::SetAttr) { + return [](ProcessedNode* p_node) { + auto module = p_node->Input(0).toObject(); + Node* node = p_node->node(); + const auto type = node->inputs()[0]->type()->expect(); + const auto& field = node->s(attr::name); + const auto slot = type->getAttributeSlot(field); + module->setSlot(slot, p_node->Input(1)); + }; } return nullptr; } diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h index d35df5f806587..021cd21fd7253 100644 --- a/torch/csrc/jit/runtime/static/ops.h +++ b/torch/csrc/jit/runtime/static/ops.h @@ -124,7 +124,6 @@ bool isOptimizableContainerType(Node* n); std::function getOutOfPlaceOperation(Node* n); -bool mayRunNatively(Node* n); std::function getNativeOperation(Node* n); inline std::string PrintNode(const Node* node) { diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index c4cdea0e94582..940a2aa2e7102 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -366,9 +366,6 @@ void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { } TORCH_LIBRARY_FRAGMENT(static_runtime, m) { - m.def("static_runtime::pure_inputs() -> Tensor", []() -> at::Tensor { - return at::randn({1}); - }); m.def("static_runtime::permute_copy(Tensor self, int[] dims) -> Tensor"); m.def( "static_runtime::reshape_copy(Tensor(a) self, int[] shape) -> Tensor(a)"); @@ -386,24 +383,10 @@ bool HasInplaceOp(std::shared_ptr& graph, const AliasDb& alias_db) { return HasInplaceOp(graph->block(), alias_db); } -void ReplaceWithCopy(std::shared_ptr& graph) { - auto* fake_input = - graph->insert(Symbol::fromQualString("static_runtime::pure_inputs"), {}); - fake_input->node()->moveBefore(*graph->nodes().begin()); - - std::vector> old_inputs; - for (auto* input : graph->inputs()) { - for (const auto& use : input->uses()) { - 
old_inputs.emplace_back(std::make_pair(input, use)); - } - input->replaceAllUsesWith(fake_input); - } - +void ReplaceWithCopy( + std::shared_ptr& graph, + bool outputs_are_immutable) { AliasDb db(graph); - for (const auto& p : old_inputs) { - p.second.user->replaceInput(p.second.offset, p.first); - } - fake_input->node()->destroy(); const std::map supported = { #ifdef FBCODE_CAFFE2 @@ -474,7 +457,7 @@ void ReplaceWithCopy(std::shared_ptr& graph) { } auto* out = n->output(); - if (db.mayContainAlias({out}, graph->outputs())) { + if (!outputs_are_immutable && db.mayContainAlias({out}, graph->outputs())) { continue; } auto* new_node = graph->create(new_symbol, n->outputs().size()); diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h index 2becd861d47fc..11ab4bdc7c46a 100644 --- a/torch/csrc/jit/runtime/static/passes.h +++ b/torch/csrc/jit/runtime/static/passes.h @@ -7,7 +7,11 @@ TORCH_API void FuseInferenceOpsForSparseNN( std::shared_ptr& graph); TORCH_API void FuseListUnpack(std::shared_ptr& graph); -TORCH_API void ReplaceWithCopy(std::shared_ptr& graph); +// If outputs_are_immutable is set to false, don't replace the view ops that +// produce aliases of graph outputs with the copy version. +TORCH_API void ReplaceWithCopy( + std::shared_ptr& graph, + bool outputs_are_immutable = true); TORCH_API bool HasInplaceOp( std::shared_ptr& graph, From f5c10fdbd3f1a6b2ec458dc4411dc3b3c69f2350 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Sat, 10 Jul 2021 14:27:49 -0700 Subject: [PATCH 062/122] Allow for heterogenous List and Dict values + Improve container typing algorithm (#57137) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57137 This PR corrects and expands our typing algorithm for unannotated, non-empty dicts and lists. Previously, to verify type correctness for an unannotated, non-empty container, we had gotten the type of the first element in the container, then checked if each following element was a subtype of the first type. That's too restrictive--what if the first element were a subtype of the second element? Instead, we should type the container by getting the smallest common supertype of all the given elements. We need slightly different rules for keys and values in dicts, though: because the set of key types is restricted, finding two key types that cannot be unified should cause an error. On the other hand, the set of value types is not restricted, so we should be able to use `Any` as a valid supertype. We need to keep the set of keys restricted since the keys are used to generate and match schemas. This does not break backwards compatibility, because the default element type is the smallest supertype of all the given types. So, if someone creates an unannotated dict where the keys are all `str` and the values are all `torch.Tensor`, the dict will be inferred to `Dict[str, Tensor]` just like it was before. Empty lists are still typed as `List[torch.Tensor],` and empty dicts are still typed as `Dict[str, Tensor]`. This PR unblocks three engineers on an FB-internal team and improves FX-TorchScript compatibility. 
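For illustration, a rough sketch of the new behavior that reuses the dict from the removed test_dict_types case (the surrounding function is illustrative, not part of this diff): an unannotated dict with mixed value types now scripts as Dict[str, Any], with a heterogeneity warning, and its values are refined with torch.jit.isinstance before use.

    import torch
    from typing import List

    @torch.jit.script
    def fn():
        # List[float] and List[int] values unify to Any, so `d` is inferred as
        # Dict[str, Any] instead of raising the old "single type" error.
        d = {'score': [1.0], 'ys': [1, 2, 3]}
        total = 0
        v = d['ys']
        if torch.jit.isinstance(v, List[int]):   # refine the Any value before use
            for x in v:
                total += x
        return total

    print(fn())   # 6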
Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D28231839 Pulled By: ansley fbshipit-source-id: 7297bf239749daa54895add708185c75e6ca5999 --- aten/src/ATen/core/jit_type.h | 12 +- aten/src/ATen/core/type.cpp | 23 +- test/HowToWriteTestsUsingFileCheck.md | 3 + test/jit/test_list_dict.py | 8 +- test/jit/test_types.py | 4 +- test/jit/test_typing.py | 129 +++++++++-- test/test_jit.py | 5 +- torch/csrc/jit/frontend/ir_emitter.cpp | 299 ++++++++++++++++++++----- 8 files changed, 393 insertions(+), 90 deletions(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 9fafdfd896f09..d733fbd2da5b1 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1574,15 +1574,21 @@ inline at::ScalarType scalarTypeFromJitType(const c10::TypePtr& type) { // then t2 will be returned (and vice versa). // Two different tensortypes will return dynamic. // Currently we chose not to support returning a NumberType for a float & int -// input because of a lack of operator support for NumberType +// input because of a lack of operator support for NumberType. +// If `type_hint` is an `InterfaceType`, then we can use that as a +// potential supertype for `ClassType`s in the list. Otherwise, we have +// no way to find and use some common interface type TORCH_API c10::optional unifyTypes( const TypePtr& t1, const TypePtr& t2, - bool default_to_any = false); + bool default_to_any = false, + TypePtr type_hint=nullptr); TORCH_API c10::optional unifyTypeList( at::ArrayRef elements, - std::ostream& why_not); + std::ostream& why_not, + bool default_to_any=false, + TypePtr type_hint=nullptr); namespace detail { template diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index fa467b7a0055f..e8cffebcfaf59 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -265,7 +265,7 @@ AnyEnumTypePtr AnyEnumType::get() { return value; } -c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2) { +c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_any=false, TypePtr type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(t2)) { return t2; @@ -308,7 +308,7 @@ c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2) { } std::vector elements; for (size_t i = 0; i < tuple1->elements().size(); i++) { - if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i))) { + if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_any)) { elements.push_back(*elem); } else { return c10::nullopt; @@ -337,11 +337,18 @@ c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2) { return t1_unshaped; } + // Check whether or not `type_hint` is a common parent. 
This case + // could occur if we had two class types that had been annotated with + // a common interface + if (type_hint && t1->isSubtypeOf(type_hint) && t2->isSubtypeOf(type_hint)) { + return type_hint; + } + return c10::nullopt; } -c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_any) { - auto unified = unifyTypesImpl(t1, t2); +c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_any, TypePtr type_hint) { + auto unified = unifyTypesImpl(t1, t2, default_to_any, type_hint); if (default_to_any && !unified) { return AnyType::get(); @@ -352,7 +359,9 @@ c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool def c10::optional unifyTypeList( at::ArrayRef elements, - std::ostream& why_not) { + std::ostream& why_not, + bool default_to_any, + TypePtr type_hint) { if (elements.size() == 0) { why_not << "Cannot get unified type from empty list"; return c10::nullopt; @@ -360,7 +369,7 @@ c10::optional unifyTypeList( TypePtr ret_type = elements.at(0); for (size_t i = 1; i < elements.size() && ret_type; ++i) { - auto maybe_unified = unifyTypes(ret_type, elements.at(i)); + c10::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_any, type_hint); if (!maybe_unified) { why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() @@ -368,7 +377,7 @@ c10::optional unifyTypeList( << ret_type->repr_str() << ")"; return c10::nullopt; } - ret_type = maybe_unified.value(); + ret_type = *maybe_unified; } return ret_type; diff --git a/test/HowToWriteTestsUsingFileCheck.md b/test/HowToWriteTestsUsingFileCheck.md index 429d7a06b4825..0795c23002a16 100644 --- a/test/HowToWriteTestsUsingFileCheck.md +++ b/test/HowToWriteTestsUsingFileCheck.md @@ -79,6 +79,9 @@ annotations from the example above one would write: * `CHECK: ` Scans the input until `PATTERN` is found. Fails if the pattern is not found. +* `CHECK-NEXT: ` + Scans the input on the line immediately following the previous CHECK until + `PATTERN` is found. Fails if the pattern is not found on that line. * `CHECK-NOT: ` Scans the input and fails if `PATTERN` is found on any line. The scan stops when a match for a next `CHECK` is found. diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index a7d30dae87470..d8434515291ab 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -244,10 +244,10 @@ def fn(): self.checkScript(fn, ()) def test_dict_keyword_with_mismatched_annotations(self): - # TODO: This fails during function schema matching, so the error - # message is not very informative to the user. Change logic so - # that the error is thrown at a different time? 
- err_msg = "Arguments for call are not valid" + err_msg = r"Dict type annotation `Dict\[int, str\]` did not "\ + "match the types of the actual dict items" + err_msg = r"Dict type annotation `Dict\[int, str\]` did not "\ + "match the type of an actual key type `str`" highlight_msg = "dict([(\"foo\", 1), (\"bar\", 2), (\"baz\", 3" with self.assertRaisesRegexWithHighlight(RuntimeError, err_msg, highlight_msg): @torch.jit.script diff --git a/test/jit/test_types.py b/test/jit/test_types.py index e7edc4734b4c5..5da4efde3746c 100644 --- a/test/jit/test_types.py +++ b/test/jit/test_types.py @@ -140,7 +140,9 @@ def wrong_type(): wrong : List[int] = [0.5] return wrong - with self.assertRaisesRegex(RuntimeError, "Lists must contain only a single type"): + with self.assertRaisesRegex(RuntimeError, "List type annotation" + r" `List\[int\]` did not match the " + "types of the given list elements"): torch.jit.script(wrong_type) def test_optional_no_element_type_annotation(self): diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py index f6797527556fe..f60f25f782e95 100644 --- a/test/jit/test_typing.py +++ b/test/jit/test_typing.py @@ -2,6 +2,7 @@ import sys import torch +from torch.testing import FileCheck from torch.testing._internal.jit_utils import JitTestCase from torch.testing._internal.common_utils import IS_WINDOWS from collections import namedtuple @@ -73,11 +74,119 @@ def test_dict_tensor_key(a, t): self.checkScript(test_dict_tensor_key, (dict_a, inp1)) self.checkScript(test_dict_tensor_key, (dict_a, inp2)) - def test_dict_types(self): - with self.assertRaisesRegex(RuntimeError, "single type"): - @torch.jit.script - def foo(): - new_item = {'score': [1.0], 'ys': [1, 2, 3]} + def test_list_type_refinement_defaults_to_Any_list_creation(self): + def fn(x): + tup1 = ("foo", torch.tensor(2)) + tup2 = ("bar", {"23": torch.tensor(3)}) + tup3 = ("baz", x) + l = list((tup1, tup2)) # noqa: C410 + l.append(tup3) + tup4 = l[0] + if torch.jit.isinstance(tup4, Tuple[str, torch.Tensor]): + t = tup4[1] + if isinstance(t, torch.Tensor): + l[0] = (tup4[0], torch.add(t, t)) + return l + + self.checkScript(fn, (torch.arange(5),)) + + graph = torch.jit.script(fn).graph + + print(graph) + + # Check that we're making a `List[Tuple[str, Any]]` + FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph) + + def test_list_type_refinement_defaults_to_Any_list_comprehension(self): + def fn(x): + tup1 = ("foo", torch.tensor(2)) + tup2 = ("bar", {"23": torch.tensor(3)}) + tup3 = ("baz", x) + l_ = [tup1, tup2] + l = [t for t in l_] # noqa: C416 + l.append(tup3) + tup4 = l[0] + if torch.jit.isinstance(tup4, Tuple[str, torch.Tensor]): + t = tup4[1] + if isinstance(t, torch.Tensor): + l[0] = (tup4[0], torch.add(t, t)) + return l + + self.checkScript(fn, (torch.arange(5),)) + + graph = torch.jit.script(fn).graph + + print(graph) + + # Check that we're making a `List[Tuple[str, Any]]` + FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph) + + def test_list_type_refinement_annotation_element_mismatch(self): + def fn(): + l: List[int] = [1, 2, "foo", 3] + return l + + with self.assertRaisesRegex(RuntimeError, "List type annotation" + r" `List\[int\]` did not match the " + "types of the given list elements"): + torch.jit.script(fn) + + def test_dict_type_refinement_defaults_to_Any_dict_creation(self): + def fn(x): + d = dict(foo=torch.tensor(2), + bar={"23": torch.tensor(3)}) + d["baz"] = x + t = d["foo"] + if isinstance(t, torch.Tensor): + d["bar"] = torch.add(t, t) + return d + + 
self.checkScript(fn, (torch.arange(5),)) + + graph = torch.jit.script(fn).graph + + FileCheck().check(r"Dict(str, Any) = prim::DictConstruct").run(graph) + + def test_dict_type_refinement_defaults_to_Any_dict_comprehension(self): + def fn(x): + d = {"foo": torch.tensor(2), + "bar": {"23": torch.tensor(3)}} + d["baz"] = x + t = d["foo"] + if isinstance(t, torch.Tensor): + d["bar"] = torch.add(t, t) + return d + + self.checkScript(fn, (torch.arange(5),)) + + graph = torch.jit.script(fn).graph + + FileCheck().check("Dict(str, Any) = prim::DictConstruct").run(graph) + + def test_dict_type_refinement_annotation_key_mismatch(self): + def fn(): + l1 = [1, 2, "foo", 3] + l2 = ["foo", "bar", "baz", "qux"] + d: Dict[int, str] = {k : v for k, v in zip(l1, l2)} + return l + + with self.assertRaisesRegex(RuntimeError, "Dict type annotation" + r" `Dict\[int, str\]` did not match" + " the type of an actual key type"): + torch.jit.script(fn) + + def test_dict_type_refinement_annotation_value_mismatch(self): + def fn(): + l1 = ["foo", "bar", "baz", "qux"] + l2 = [1, 2, "foo", 3] + d: Dict[str, int] = {k : v for k, v in zip(l1, l2)} + return l + + with self.assertRaisesRegex(RuntimeError, "Dict type annotation" + r" `Dict\[str, int\]` did not match" + " the type of an actual value " + "type"): + torch.jit.script(fn) def test_dict_invalid_annotations(self): # Check for invalid value type annotation @@ -200,16 +309,6 @@ def fn2(x): self.checkScript(fn, []) self.checkScript(fn2, (torch.ones(2, 2),)) - with self.assertRaisesRegex(RuntimeError, "Could not unify"): - @torch.jit.script - def fn(): - return [1, 1.2] - - with self.assertRaisesRegex(RuntimeError, "Could not unify"): - @torch.jit.script - def fn(): - return [1, torch.ones(1, 2)] - # to avoid defining sum_list in multiple tests def get_sum_list_fn(self): def sum_list(a): diff --git a/test/test_jit.py b/test/test_jit.py index 3bfe6cf841957..5d4096b4aab32 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10584,7 +10584,10 @@ def f4(a): def f5(a): torch.cat([3]) - with self.assertRaisesRegex(RuntimeError, 'Lists must contain only a single type'): + with self.assertRaisesRegex(RuntimeError, r'Expected a value of' + r' type \'List\[int\]\' for argument' + r' \'size\' but instead found type ' + r'\'List\[Any\]\''): @torch.jit.script def f6(a): a.expand(size=[3, [4]]) diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 0133a48edb0e1..f54a3601bf168 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -1321,13 +1321,51 @@ struct to_ir { pushFrame(comprehension_block); WithInsertPoint guard(comprehension_block); auto emit_body = [&]() { - auto comprehension_out = emitExpr(lc.elt()); + Value* out = emitExpr(lc.elt()); + + // If we didn't have a type annotation, the type of the list would + // be set to `Tensor`. We don't want to unify this default type + // with the actual elements in the list, so let the type begin as + // the first element in the list if (!type_set) { - list_value->setType(ListType::create(comprehension_out->type())); + list_value->setType(ListType::create(out->type())); type_set = true; } + + ListTypePtr lt = list_value->type()->expect(); + + const TypePtr element_type_hint = + type_hint ? 
type_hint->expect()->getElementType() : nullptr; + + auto unified = unifyTypes( + lt->getElementType(), + out->type(), + /*default_to_any=*/true, + element_type_hint); + + if (lt->getElementType() != AnyType::get() && + *unified == AnyType::get()) { + TORCH_WARN( + "List consists of heterogeneous types, which means", + " that it has been typed as `List[Any]`. To use " + "any of the values in the List, it will be " + "necessary to add an `assert isinstance` statement " + "before first use to trigger type refinement. The first ", + "non-matching element was typed as ", + out->type()->repr_str(), + ", while the elements before it " + "were ", + lt->getElementType()->repr_str(), + "\n", + lc.range().str()); + } + + if (!type_hint) { + list_value->setType(ListType::create(*unified)); + } + NamedValue self = NamedValue(loc, "self", list_value); - NamedValue input = NamedValue(loc, "", comprehension_out); + NamedValue input = NamedValue(loc, "", out); emitBuiltinCall(loc, *graph, aten::append, {input}, {}, self); }; emitFor(targets_list, itrs, loc, emit_body); @@ -1366,10 +1404,88 @@ struct to_ir { auto emit_body = [&]() { auto k = emitExpr(dc.key()); auto v = emitExpr(dc.value()); + + // Make sure that any key and value types are subtypes of the + // annotatated key/value types + if (type_hint) { + DictTypePtr dict_type_hint = type_hint->expect(); + + std::stringstream ss; + std::stringstream err; + + bool is_key_subtype = + k->type()->isSubtypeOfExt(dict_type_hint->getKeyType(), &ss); + + if (!is_key_subtype) { + err << "Dict type annotation `" << dict_type_hint->repr_str() + << "` did not match the " + << "type of an actual key type `" << k->type()->repr_str() + << "`\n" + << ss.str(); + } + + ss.str(std::string()); + bool is_value_subtype = + v->type()->isSubtypeOfExt(dict_type_hint->getValueType(), &ss); + + if (!is_value_subtype) { + err << "Dict type annotation `" << dict_type_hint->repr_str() + << "` did not match the " + << "type of an actual value type `" << v->type()->repr_str() + << "`\n" + << ss.str(); + } + + if (!is_key_subtype || !is_value_subtype) { + throw ErrorReport(dc) << err.str(); + } + } + + // If we didn't have a type annotation, the type of the dict would + // be set to `(str, Tensor)`. We don't want to unify this default + // type with the actual elements in the dict, so let the type + // begin as the first element in the dict if (!type_set) { dict_value->setType(DictType::create(k->type(), v->type())); type_set = true; } + + DictTypePtr dt = dict_value->type()->expect(); + + const TypePtr value_type_hint = + type_hint ? type_hint->expect()->getKeyType() : nullptr; + + c10::optional unified = unifyTypes( + dt->getValueType(), + v->type(), + /*default_to_any=*/true, + value_type_hint); + + // Warn the user if we inferred the type of the values to be `Any` + // even though the annotation was something else + if (dt->getValueType() != AnyType::get() && *unified == AnyType::get()) { + TORCH_WARN( + "Dict consists of heterogeneous types, which means", + " that it has been typed as `Dict[str, Any]`. To use " + "any of the values in the Dict, it will be " + "necessary to add an `assert isinstance` statement " + "before first use to trigger type refinement. 
The first ", + "non-matching element was typed as ", + v->type()->repr_str(), + ", while the elements before it " + "were ", + dt->getValueType()->repr_str(), + "\n", + dc.range().str()); + } + + // We only want to set `dict_value` if we don't have a type hint + // to allow for the case that `*unified` is a subtype of + // the value type given by `type_hint` + if (!type_hint) { + dict_value->setType(DictType::create(k->type(), *unified)); + } + NamedValue self = NamedValue(loc, "self", dict_value); NamedValue input_k = NamedValue(loc, "", k); NamedValue input_v = NamedValue(loc, "", v); @@ -3534,6 +3650,66 @@ struct to_ir { ->call(tree->range(), method, named_values, {}, 0)); } + Value* emitListLiteral(ListLiteral ll, TypePtr type_hint) { + auto values = getValues(ll.inputs(), /*maybe_unpack=*/true); + + // Determine the element type of the list. If we have a type hint + // of `List[T]`, use `T`. If the list is non-empty, find the + // greatest common supertype of all the list elements (defaulting to + // `Any` as a catch-all supertype). Assume `[]` is `List[Tensor]` + TypePtr elem_type = TensorType::get(); + + if (type_hint) { + if (type_hint->kind() == TypeKind::ListType) { + elem_type = type_hint->expectRef().getElementType(); + } else { + // If the type hint was not `List[T]`, throw an error + throw ErrorReport(ll) << "Expected a List type hint but instead got " + << type_hint->repr_str(); + } + } + + if (!values.empty()) { + auto types = fmap(values, [](const Value* v) { return v->type(); }); + + std::stringstream nowhere; // never used + + const TypePtr element_type_hint = + type_hint ? type_hint->expect()->getElementType() : nullptr; + + c10::optional unified = unifyTypeList( + types, nowhere, /*default_to_any=*/true, element_type_hint); + + if (!type_hint && *unified == AnyType::get()) { + TORCH_WARN( + "List consists of heterogeneous types, which means", + " that it has been typed as `List[Any]`. To use " + "any of the values in the List, it will be " + "necessary to add an `assert isinstance` statement " + "before first use to trigger type refinement. 
\n", + ll.range().str()); + } + + if (type_hint && !(*unified)->isSubtypeOf(elem_type)) { + throw ErrorReport(ll) + << "List type annotation `" << type_hint->repr_str() + << "` did not match the types of the given list elements," + << " which were unified to " << (*unified)->repr_str(); + } + + // We only want to set `elem_type` if we don't have a type hint + // to allow for the case that `*unified` is a subtype of + // `type_hint` + if (!type_hint) { + elem_type = *unified; + } + } + + Value* result = + graph->insertNode(graph->createList(elem_type, values))->output(); + return result; + } + Value* emitSimpleExpr( const TreeRef& tree, const TypePtr& type_hint = nullptr) { @@ -3616,46 +3792,7 @@ struct to_ir { } break; case TK_LIST_LITERAL: { auto ll = ListLiteral(tree); - auto values = getValues(ll.inputs(), /*maybe_unpack=*/true); - - // determine the element type of the list - // if we have a type hint of List[T], use T - // if the list is non-empty use type_of(list[0]) - // otherwise assume it is List[Tensor] - TypePtr elem_type = TensorType::get(); - if (type_hint) { - if (type_hint->kind() == TypeKind::ListType) { - elem_type = type_hint->expectRef().getElementType(); - } else { - // If the type hint was not a List[T] throw an error - throw ErrorReport(tree) - << "Expected a List type hint but instead got " - << type_hint->repr_str(); - } - } else if (!values.empty()) { - std::stringstream ss; - auto types = fmap(values, [](const Value* v) { return v->type(); }); - auto maybe_elem_type = unifyTypeList(types, ss); - if (!maybe_elem_type) { - throw ErrorReport(tree) << "Lists must contain only a single type\n" - << ss.str(); - } - elem_type = maybe_elem_type.value(); - } - - for (auto v : values) { - std::stringstream ss; - if (!v->type()->isSubtypeOfExt(elem_type, &ss)) { - throw ErrorReport(tree) - << "Lists must contain only a single type, expected: " - << elem_type->repr_str() << " but found " - << v->type()->repr_str() << " instead.\n" - << ss.str(); - } - } - Value* result = - graph->insertNode(graph->createList(elem_type, values))->output(); - return result; + return emitListLiteral(ll, type_hint); } break; case TK_TUPLE_LITERAL: { auto ll = TupleLiteral(tree); @@ -3690,24 +3827,68 @@ struct to_ir { } AT_ASSERT(key_type != nullptr && value_type != nullptr); - auto checkTypeOfValues = [](const TypePtr& type, - const char* what, - const std::vector& values, - TreeList trees) { - for (size_t i = 0, N = values.size(); i < N; ++i) { - std::stringstream ss; - if (!values[i]->type()->isSubtypeOfExt(type, &ss)) { - throw ErrorReport(trees[i]) - << "Dict " << what - << " must contain only a single type, expected: " - << type->repr_str() << " but found " - << values[i]->type()->repr_str() << " instead.\n" - << ss.str(); + for (size_t i = 0; i < keys.size(); ++i) { + std::stringstream ss; + if (!keys[i]->type()->isSubtypeOfExt(key_type, &ss)) { + throw ErrorReport(key_trees[i]) + << "Dict keys must contain " + << "only a single type. Expected: " << key_type->repr_str() + << " but found " << keys[i]->type()->repr_str() << " instead.\n" + << ss.str(); + } + } + + if (!values.empty()) { + auto types = fmap(values, [](const Value* v) { return v->type(); }); + + std::stringstream nowhere; // never used + + const TypePtr value_type_hint = + type_hint ? 
type_hint->expect()->getKeyType() : nullptr; + + c10::optional unified = unifyTypeList( + types, + /*why_not=*/nowhere, + /*default_to_any=*/true, + value_type_hint); + + if (!type_hint && *unified == AnyType::get()) { + TORCH_WARN( + "Dict values consist of heterogeneous types, which " + "means that they have been typed as `Any`. To use " + "any of the values in the Dist, it will be " + "necessary to add an `assert isinstance` statement " + "before first use to trigger type refinement. \n", + dl.range().str()); + } + + if (type_hint) { + TypePtr value_type_hint = + type_hint->expect()->getValueType(); + for (size_t i = 0; i < types.size(); ++i) { + TORCH_CHECK( + types[i]->isSubtypeOf(value_type_hint), + "Type " + "hint for dict was", + type_hint->repr_str(), + "but the value ", + "at index ", + i, + " has type ", + types[i]->repr_str(), + ", which is not a valid" + " subtype of ", + value_type_hint->repr_str()); } } - }; - checkTypeOfValues(key_type, "keys", keys, key_trees); - checkTypeOfValues(value_type, "values", values, value_trees); + + // We only want to set `value_type` if we don't have a type + // hint to allow for the case that `*unified` is a subtype of + // the value type given by `type_hint` + if (!type_hint) { + value_type = *unified; + } + } return graph ->insertNode(graph->createDict(key_type, value_type, keys, values)) From 2942e9aa802537aac9e04bf79299b559533ed060 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Sat, 10 Jul 2021 15:13:39 -0700 Subject: [PATCH 063/122] model_dump: update maintainer comment (#60698) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60698 ... to reflect that the Python command should be re-run when changing the model. Test Plan: CI Reviewed By: dhruvbird Differential Revision: D29380399 Pulled By: dreiss fbshipit-source-id: 1ec464da4ebe6ddf400eb4a3b14da683369c0039 --- torch/utils/model_dump/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py index c4a89a0282208..ad38d66f771ce 100644 --- a/torch/utils/model_dump/__init__.py +++ b/torch/utils/model_dump/__init__.py @@ -41,7 +41,8 @@ in the browser. In another terminal, run "python -m torch.utils.model_dump --style=json FILE > \ torch/utils/model_dump/model_info.json" -every time you update the Python code. When you update JS, just refresh. +every time you update the Python code or model. +When you update JS, just refresh. Possible improvements: - Fix various TODO comments in this file and the JS. From e292f34def06889f3d7e06cc065920d93a1f763b Mon Sep 17 00:00:00 2001 From: David Reiss Date: Sat, 10 Jul 2021 15:13:39 -0700 Subject: [PATCH 064/122] model_dump: Make stdout argument for main a keyword-only argument (#60699) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60699 Also add a unit test for main, which brings the test coverage up to ~98%. Also factor out the "needs importlib.resources" check into a function for easier reuse. 
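For illustration, a minimal sketch of the keyword-only pattern this change adopts (the names below are placeholders, not the exact model_dump signature):

    import io, sys

    def main(argv, *, stdout=None):
        # everything after the bare "*" can only be passed by keyword
        out = stdout if stdout is not None else sys.stdout
        out.write("dump output goes here\n")

    main(sys.argv)                        # stdout defaults to sys.stdout
    main(sys.argv, stdout=io.StringIO())  # tests can capture the output
    # main(sys.argv, io.StringIO())       # TypeError: stdout is keyword-only

Making the argument keyword-only keeps existing positional call sites unambiguous while letting the new unit test inject a StringIO.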
Test Plan: CI Reviewed By: dhruvbird Differential Revision: D29380397 Pulled By: dreiss fbshipit-source-id: bba16da85bf7bfb4370308e38c844694d01b47eb --- test/test_model_dump.py | 40 ++++++++++++++++++++++++++++-- torch/utils/model_dump/__init__.py | 2 +- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/test/test_model_dump.py b/test/test_model_dump.py index 43ab99e5ced09..cb2397747b835 100644 --- a/test/test_model_dump.py +++ b/test/test_model_dump.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 import sys import io +import tempfile import unittest import torch import torch.utils.model_dump import torch.utils.mobile_optimizer -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS from torch.testing._internal.common_quantized import supported_qengines @@ -56,8 +57,12 @@ def forward(self, arg): class TestModelDump(TestCase): - @unittest.skipIf(sys.version_info < (3, 7), "importlib.resources was new in 3.7") + def needs_resources(self): + if sys.version_info < (3, 7): + self.skipTest("importlib.resources was new in 3.7") + def test_inline_skeleton(self): + self.needs_resources() skel = torch.utils.model_dump.get_inline_skeleton() assert "unpkg.org" not in skel assert "src=" not in skel @@ -77,6 +82,37 @@ def test_traced_model(self): model = torch.jit.trace(SimpleModel(), torch.zeros(2, 16)) self.do_dump_model(model) + def test_main(self): + self.needs_resources() + if IS_WINDOWS: + # I was getting tempfile errors in CI. Just skip it. + self.skipTest("Disabled on Windows.") + + with tempfile.NamedTemporaryFile() as tf: + torch.jit.save(torch.jit.script(SimpleModel()), tf) + + stdout = io.StringIO() + torch.utils.model_dump.main( + [ + None, + "--style=json", + tf.name, + ], + stdout=stdout) + self.assertRegex(stdout.getvalue(), r'\A{.*SimpleModel') + + stdout = io.StringIO() + torch.utils.model_dump.main( + [ + None, + "--style=html", + tf.name, + ], + stdout=stdout) + self.assertRegex( + stdout.getvalue().replace("\n", " "), + r'\A Date: Sat, 10 Jul 2021 15:13:39 -0700 Subject: [PATCH 065/122] model_dump: Render constants.pkl similar to data.pkl (#60700) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60700 Test Plan: Dumped a model with a lot of constants (qconvs produced by optimizing). Was able to see them rendered nicely. 
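As a hedged usage sketch (SimpleModel stands in for any scriptable module, and the dict keys follow the ones added in this diff): freezing is a common way tensors end up in constants.pkl, and those tensors are now rendered alongside data.pkl:

    import torch
    import torch.utils.model_dump

    scripted = torch.jit.script(SimpleModel()).eval()
    frozen = torch.jit.freeze(scripted)   # parameters move from data.pkl to constants.pkl
    torch.jit.save(frozen, "frozen.pt")

    info = torch.utils.model_dump.get_model_info("frozen.pt")
    # the burned-in JSON now carries both pickles side by side
    model_data = info["model"]["model_data"]
    constants = info["model"]["constants"]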
Reviewed By: dhruvbird Differential Revision: D29380400 Pulled By: dreiss fbshipit-source-id: c951508b92bb2717591dd173282157e1a40a30bd --- torch/utils/model_dump/__init__.py | 12 +++++++++--- torch/utils/model_dump/code.js | 21 +++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py index b3154281e3825..85be65281070e 100644 --- a/torch/utils/model_dump/__init__.py +++ b/torch/utils/model_dump/__init__.py @@ -221,9 +221,14 @@ def get_model_info( assert path_prefix is not None version = zf.read(path_prefix + "/version").decode("utf-8").strip() - with zf.open(path_prefix + "/data.pkl") as handle: - raw_model_data = torch.utils.show_pickle.DumpUnpickler(handle, catch_invalid_utf8=True).load() - model_data = hierarchical_pickle(raw_model_data) + def get_pickle(name): + assert path_prefix is not None + with zf.open(path_prefix + f"/{name}.pkl") as handle: + raw = torch.utils.show_pickle.DumpUnpickler(handle, catch_invalid_utf8=True).load() + return hierarchical_pickle(raw) + + model_data = get_pickle("data") + constants = get_pickle("constants") # Intern strings that are likely to be re-used. # Pickle automatically detects shared structure, @@ -318,6 +323,7 @@ def ist(s): interned_strings=list(interned_strings), code_files=code_files, model_data=model_data, + constants=constants, extra_files_jsons=extra_files_jsons, extra_pickles=extra_pickles, )} diff --git a/torch/utils/model_dump/code.js b/torch/utils/model_dump/code.js index 4afc6a026260e..65cea2438d372 100644 --- a/torch/utils/model_dump/code.js +++ b/torch/utils/model_dump/code.js @@ -88,15 +88,15 @@ function ModelSizeSection({model: {file_size, zip_files}}) { `; } -function ModelStructureSection({model: {model_data}}) { +function StructuredDataSection({name, data, shown}) { return html` - <${Hider} name="Model Structure" shown=true> + <${Hider} name=${name} shown=${shown}>
- <${ModelData} data=${model_data} indent="" prefix=""/> + <${StructuredData} data=${data} indent="" prefix=""/>
`; } -class ModelData extends Component { +class StructuredData extends Component { constructor() { super(); this.state = { shown: false }; @@ -236,7 +236,7 @@ class ModelData extends Component { let parts = []; for (let idx = 0; idx < data.length; idx++) { // Does it make sense to put explicit index numbers here? - parts.push(html`
<${ModelData} prefix=${idx + ": "} indent=${new_indent} data=${data[idx]} />`); + parts.push(html`
<${StructuredData} prefix=${idx + ": "} indent=${new_indent} data=${data[idx]} />`); } return parts; } @@ -251,7 +251,7 @@ class ModelData extends Component { if (typeof(data.keys[idx]) != "string") { parts.push(html`
${new_indent}Non-string key`); } else { - parts.push(html`
<${ModelData} prefix=${data.keys[idx] + ": "} indent=${new_indent} data=${data.values[idx]} />`); + parts.push(html`
<${StructuredData} prefix=${data.keys[idx] + ": "} indent=${new_indent} data=${data.values[idx]} />`); } } return parts; @@ -271,16 +271,16 @@ class ModelData extends Component { } else if (this.IGNORED_STATE_KEYS.has(mstate.keys[idx])) { // Do nothing. } else { - parts.push(html`
<${ModelData} prefix=${mstate.keys[idx] + ": "} indent=${new_indent} data=${mstate.values[idx]} />`); + parts.push(html`
<${StructuredData} prefix=${mstate.keys[idx] + ": "} indent=${new_indent} data=${mstate.values[idx]} />`); } } } else if (mstate.__tuple_values__) { - parts.push(html`
<${ModelData} prefix="" indent=${new_indent} data=${mstate} />`); + parts.push(html`
<${StructuredData} prefix="" indent=${new_indent} data=${mstate} />`); } else if (mstate.__module_type__) { // We normally wouldn't have the state of a module be another module, // but we use "modules" to encode special values (like Unicode decode // errors) that might be valid states. Just go with it. - parts.push(html`
<${ModelData} prefix="" indent=${new_indent} data=${mstate} />`); + parts.push(html`
<${StructuredData} prefix="" indent=${new_indent} data=${mstate} />`); } else { throw new Error("Bad module state"); } @@ -639,7 +639,8 @@ class App extends Component {

TorchScript Model (version ${model.version}): ${model.title}

<${ModelSizeSection} model=${model}/> - <${ModelStructureSection} model=${model}/> + <${StructuredDataSection} name="Model Data" data=${model.model_data} shown=true/> + <${StructuredDataSection} name="Constants" data=${model.constants} shown=false/> <${ZipContentsSection} model=${model}/> <${CodeSection} model=${model}/> <${ExtraJsonSection} files=${model.extra_files_jsons}/> From 158d35151707d9214a6de33a1e72a1f9554e7cbb Mon Sep 17 00:00:00 2001 From: David Reiss Date: Sat, 10 Jul 2021 15:13:39 -0700 Subject: [PATCH 066/122] model_dump: Add webdriver test (#60701) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60701 The unit test previously only tested that the dump could complete successfully. It was not able to verify that any JS worked properly. Now we can test the JS as long as webdriver is installed. Tweaked the implementation of Hider a bit to make it easier for tests to find and open them. I disabled the tests by default since I don't want to deal with webdriver in CI. Enable them with the environment variable RUN_WEBDRIVER=1. We could make the tests use headless mode, but it's kind of fun to watch them run. Add a test to verify that tensor memory computation is working for the simple model. Test Plan: Ran the test. Reviewed By: dhruvbird Differential Revision: D29380398 Pulled By: dreiss fbshipit-source-id: f19d0b05d79ad5a8231e85422976f1889e021c89 --- test/test_model_dump.py | 60 ++++++++++++++++++++++++++++++++++ torch/utils/model_dump/code.js | 5 +-- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/test/test_model_dump.py b/test/test_model_dump.py index cb2397747b835..ad892ab275d5c 100644 --- a/test/test_model_dump.py +++ b/test/test_model_dump.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 import sys +import os import io +import functools import tempfile +import urllib import unittest import torch @@ -56,6 +59,27 @@ def forward(self, arg): return arg +def webdriver_test(testfunc): + @functools.wraps(testfunc) + def wrapper(self, *args, **kwds): + self.needs_resources() + + if os.environ.get("RUN_WEBDRIVER") != "1": + self.skipTest("Webdriver not requested") + from selenium import webdriver + + for driver in [ + "Firefox", + "Chrome", + ]: + with self.subTest(driver=driver): + wd = getattr(webdriver, driver)() + testfunc(self, wd, *args, **kwds) + wd.close() + + return wrapper + + class TestModelDump(TestCase): def needs_resources(self): if sys.version_info < (3, 7): @@ -74,6 +98,22 @@ def do_dump_model(self, model, extra_files=None): info = torch.utils.model_dump.get_model_info(buf) assert info is not None + def open_html_model(self, wd, model, extra_files=None): + buf = io.BytesIO() + torch.jit.save(model, buf, _extra_files=extra_files) + info = torch.utils.model_dump.get_model_info(buf) + skeleton = torch.utils.model_dump.get_inline_skeleton() + page = torch.utils.model_dump.burn_in_info(skeleton, info) + wd.get("data:text/html;charset=utf-8," + urllib.parse.quote(page)) + + def open_section_and_get_body(self, wd, name): + container = wd.find_element_by_xpath(f"//div[@data-hider-title='{name}']") + caret = container.find_element_by_class_name("caret") + if container.get_attribute("data-shown") != "true": + caret.click() + content = container.find_element_by_tag_name("div") + return content + def test_scripted_model(self): model = torch.jit.script(SimpleModel()) self.do_dump_model(model) @@ -145,6 +185,26 @@ def test_invalid_json(self): model = torch.jit.script(SimpleModel()) self.do_dump_model(model, extra_files={"foo.json": "{"}) + 
@webdriver_test + def test_memory_computation(self, wd): + def check_memory(model, expected): + self.open_html_model(wd, model) + memory_table = self.open_section_and_get_body(wd, "Tensor Memory") + device = memory_table.find_element_by_xpath("//table/tbody/tr[1]/td[1]").text + self.assertEqual("cpu", device) + memory_usage_str = memory_table.find_element_by_xpath("//table/tbody/tr[1]/td[2]").text + self.assertEqual(expected, int(memory_usage_str)) + + simple_model_memory = ( + # First layer, including bias. + 64 * (16 + 1) + + # Second layer, including bias. + 8 * (64 + 1) + # 32-bit float + ) * 4 + + check_memory(torch.jit.script(SimpleModel()), simple_model_memory) + if __name__ == '__main__': run_tests() diff --git a/torch/utils/model_dump/code.js b/torch/utils/model_dump/code.js index 65cea2438d372..57fe01eaa68cb 100644 --- a/torch/utils/model_dump/code.js +++ b/torch/utils/model_dump/code.js @@ -56,8 +56,9 @@ class Hider extends Component { render({name, children}, {shown}) { let my_caret = html` this.click()} >${caret(shown)}`; - return html`

${my_caret} ${name}

- ${shown ? this.props.children : []}`; + return html`
+

${my_caret} ${name}

+
${shown ? this.props.children : []}
`; } click() { From 7fdc5f9e08d78330afdef12761558b9a08c62667 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Sat, 10 Jul 2021 15:13:39 -0700 Subject: [PATCH 067/122] model_dump: Fix non-counting and double-counting bugs in tensor memory (#60702) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60702 - Instead of traversing and counting all tensor memory, collect a map from storage key to storage info while traversing. Add up sizes at the end to avoid double counting. - Count tensor memory from constants as well. Test Plan: Ran webdriver test. Reviewed By: dhruvbird Differential Revision: D29380396 Pulled By: dreiss fbshipit-source-id: 6d0fd66f677fe23c851aa218387aa4dc59502b1e --- test/test_model_dump.py | 14 ++++++++ torch/utils/model_dump/code.js | 61 +++++++++++++++++++++++----------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/test/test_model_dump.py b/test/test_model_dump.py index ad892ab275d5c..417bb2a91ad62 100644 --- a/test/test_model_dump.py +++ b/test/test_model_dump.py @@ -205,6 +205,20 @@ def check_memory(model, expected): check_memory(torch.jit.script(SimpleModel()), simple_model_memory) + # The same SimpleModel instance appears twice in this model. + # The tensors will be shared, so ensure no double-counting. + a_simple_model = SimpleModel() + check_memory( + torch.jit.script( + torch.nn.Sequential(a_simple_model, a_simple_model)), + simple_model_memory) + + # The freezing process will move the weight and bias + # from data to constants. Ensure they are still counted. + check_memory( + torch.jit.freeze(torch.jit.script(SimpleModel()).eval()), + simple_model_memory) + if __name__ == '__main__': run_tests() diff --git a/torch/utils/model_dump/code.js b/torch/utils/model_dump/code.js index 57fe01eaa68cb..c3ddd9a4c75d5 100644 --- a/torch/utils/model_dump/code.js +++ b/torch/utils/model_dump/code.js @@ -445,6 +445,13 @@ class OnePickleSection extends Component { } } +function assertStorageAreEqual(key, lhs, rhs) { + if (lhs.length !== rhs.length || + !lhs.every((val, idx) => val === rhs[idx])) { + throw new Error("Storage mismatch for key '" + key + "'"); + } +} + function computeTensorMemory(numel, dtype) { const sizes = { "Byte": 1, @@ -473,60 +480,74 @@ function computeTensorMemory(numel, dtype) { // TODO: Maybe track by dtype as well. // TODO: Maybe distinguish between visible size and storage size. -// TODO: Maybe don't double-count if the model has -// multiple references to the same submodule or tensor. 
-function getTensorMemoryByDevice(data) { +function getTensorStorages(data) { if (data === null) { - return {}; + return new Map(); } if (typeof(data) == "boolean") { - return {}; + return new Map(); } if (typeof(data) == "number") { - return {}; + return new Map(); } if (typeof(data) == "string") { - return {}; + return new Map(); } if (typeof(data) != "object") { throw new Error("Not an object"); } if (Array.isArray(data)) { - let result = {}; + let result = new Map(); for (const item of data) { - const sizes = getTensorMemoryByDevice(item); - for (const [device, size] of Object.entries(sizes)) { - result[device] = (result[device] || 0) + size; + const tensors = getTensorStorages(item); + for (const [key, storage] of tensors.entries()) { + if (!result.has(key)) { + result.set(key, storage); + } else { + const old_storage = result.get(key); + assertStorageAreEqual(key, old_storage, storage); + } } } return result; } if (data.__tuple_values__) { - return getTensorMemoryByDevice(data.__tuple_values__); + return getTensorStorages(data.__tuple_values__); } if (data.__is_dict__) { - return getTensorMemoryByDevice(data.values); + return getTensorStorages(data.values); } if (data.__module_type__) { - return getTensorMemoryByDevice(data.state); + return getTensorStorages(data.state); } if (data.__tensor_v2__) { const [storage, offset, size, stride, grad] = data.__tensor_v2__; const [dtype, key, device, numel] = storage; - return {[device]: computeTensorMemory(numel, dtype)}; + return new Map([[key, storage]]); } if (data.__qtensor__) { const [storage, offset, size, stride, quantizer, grad] = data.__qtensor__; const [dtype, key, device, numel] = storage; - return {[device]: computeTensorMemory(numel, dtype)}; + return new Map([[key, storage]]); } throw new Error("Can't handle data type.", data); } +function getTensorMemoryByDevice(data) { + const tensors = getTensorStorages(data); + let result = {}; + for (const storage of tensors.values()) { + const [dtype, key, device, numel] = storage; + const size = computeTensorMemory(numel, dtype); + result[device] = (result[device] || 0) + size; + } + return result; +} + // Make this a separate component so it is rendered lazily. class OpenTensorMemorySection extends Component { - render({model_data}) { - let sizes = getTensorMemoryByDevice(model_data); + render({model: {model_data, constants}}) { + let sizes = getTensorMemoryByDevice([model_data, constants]); return html` @@ -547,10 +568,10 @@ class OpenTensorMemorySection extends Component { } } -function TensorMemorySection({model: {model_data}}) { +function TensorMemorySection({model}) { return html` <${Hider} name="Tensor Memory" shown=false> - <${OpenTensorMemorySection} model_data=${model_data} />`; + <${OpenTensorMemorySection} model=${model} />`; } class AuxContentPane extends Component { From 7d7b7abb3baf44e37033ac2e1843ce6a07475ce1 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Sat, 10 Jul 2021 16:58:25 -0700 Subject: [PATCH 068/122] [Static Runtime] Separate function for getting always_alive values (#61506) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61506 Separate out the logic of GetAlwaysAliveValues from GetLivenessMap so to simplify the code structure. Also you don't need to run GetLivenessMap if optimize_memory is turned off. 
Reviewed By: ajyu Differential Revision: D29423534 fbshipit-source-id: dbdeeb10f7bcad86a24aa12f741f7c9ab946bb3b --- torch/csrc/jit/runtime/static/impl.cpp | 88 +++++++++++++++----------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 7924e1b957a2c..8a24b84e4b0f8 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -38,8 +38,8 @@ void OptimizeGraph( // to exposed folders. #ifdef FBCODE_CAFFE2 if (opts.enable_out_variant) { - ReplaceWithCopy(graph); FuseListUnpack(graph); + ReplaceWithCopy(graph); } #endif ConstantPropagation(graph); @@ -101,24 +101,53 @@ bool mayContainAlias( return db.mayContainAlias(as, bs); } -// Returns two useful constructs: -// first: map each value to all values that are alive -// at the same time. -// second: set of all inputs/outputs/constants (always alive) -// and their aliases +// Get set of all inputs/outputs/constants (always alive) and their aliases +std::unordered_set GetAlwaysAliveValues( + const std::shared_ptr& graph, + AliasDb& db) { + // a set of Values whose live-range exceed current inference + std::unordered_set always_alive; + + // mark inputs, constants, outputs as always_alive + for (const auto* input : graph->inputs()) { + always_alive.insert(input); + } + for (const auto* output : graph->outputs()) { + always_alive.insert(output); + } + for (const auto* node : graph->nodes()) { + if (node->kind() == prim::Constant) { + for (const auto* output : node->outputs()) { + always_alive.insert(output); + } + } + } + + // insert aliases of always live Values + for (const auto* node : graph->nodes()) { + // constants are already in the always_alive set + if (node->kind() != prim::Constant) { + for (const auto* v : node->outputs()) { + if (mayContainAlias(db, ValueSet{v}, always_alive)) { + always_alive.insert(v); + } + } + } + } + return always_alive; +} + +// Map each value to all values that are alive at the same time. +using LivenessMap = std::unordered_map>; + // The algorithm does a traversal of the execution graph // while keeping track of the live values. 
-using LivenessInformation = std::pair< - std::unordered_map>, - std::unordered_set>; - -LivenessInformation GetLivenessInformation( +LivenessMap GetLivenessMap( const std::shared_ptr& graph, + const std::unordered_set& always_alive, AliasDb& db) { // map a Value to a set of Values that overlap live-ranges with the Value's std::unordered_map> liveness_map; - // a set of Values whose live-range exceed current inference - std::unordered_set always_alive; // map Values to its creation order in graph (Note: only traverse top-level // nodes such that nodes under control-flows are represented by top-level @@ -140,21 +169,6 @@ LivenessInformation GetLivenessInformation( // inputs) std::unordered_map> live_nodes_def_chain; - // mark inputs, constants, outputs as always_alive - for (const auto* input : graph->inputs()) { - always_alive.insert(input); - } - for (const auto* output : graph->outputs()) { - always_alive.insert(output); - } - for (const auto* node : graph->nodes()) { - if (node->kind() == prim::Constant) { - for (const auto* output : node->outputs()) { - always_alive.insert(output); - } - } - } - // add v to the current liveness_map std::function add_live_value_fn = [&](const Value* v) { if (liveness_map.count(v)) { @@ -225,9 +239,7 @@ LivenessInformation GetLivenessInformation( for (const auto* node : graph->nodes()) { for (const auto* v : node->outputs()) { - if (mayContainAlias(db, ValueSet{v}, always_alive)) { - always_alive.insert(v); - } else { + if (always_alive.count(v) == 0) { add_live_value_fn(v); } } @@ -254,7 +266,7 @@ LivenessInformation GetLivenessInformation( } } - return std::make_pair(liveness_map, always_alive); + return liveness_map; } // Collect the set of Values that are candidates for memory planning: @@ -338,12 +350,11 @@ GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // and debug. std::unordered_map> GenerateSameStorageValues( - const LivenessInformation& lm, + const LivenessMap& alive_during, + const std::unordered_set& always_alive, const std::pair, std::vector>& optimizable, AliasDb& db) { - const auto& alive_during = lm.first; - const auto& always_alive = lm.second; const auto& optimizable_values = optimizable.first; const auto& all_values = optimizable.second; @@ -599,12 +610,13 @@ StaticModule::StaticModule( // Prepare for memory planning AliasDb alias_db(graph_); - auto lm = GetLivenessInformation(graph_, alias_db); - external_values_ = lm.second; + external_values_ = GetAlwaysAliveValues(graph_, alias_db); + if (opts_.optimize_memory) { + auto lm = GetLivenessMap(graph_, external_values_, alias_db); auto values = GetMemoryPlanningCandidates(graph_); value_to_same_storage_values_ = - GenerateSameStorageValues(lm, values, alias_db); + GenerateSameStorageValues(lm, external_values_, values, alias_db); } } From a46d4212bf47e46a00ebf33f6d8df997c8b6aff0 Mon Sep 17 00:00:00 2001 From: gmagogsfm Date: Sat, 10 Jul 2021 17:03:55 -0700 Subject: [PATCH 069/122] Allow dims=0 in torch.tensordot call (#61331) Summary: In one of my previous PRs that rewrite `tensordot` implementation, I mistakenly take empty value of `dims_a` and `dims_b` as illegal values. This turns out to be not true. Empty `dims_a` and `dims_b` are supported, in fact common when `dims` is passed as an integer. This PR removes the unnecessary check. 
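For illustration, the now-accepted calls (an informal sketch mirroring the new test):

    import torch

    a, b = torch.randn(3), torch.randn(4)
    # dims=0 contracts over no dimensions, i.e. an outer product
    torch.tensordot(a, b, dims=0).shape   # torch.Size([3, 4])

    # scalar inputs work as well, matching numpy.tensordot
    torch.tensordot(torch.tensor(2.0), torch.tensor(3.0), 0)   # tensor(6.)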
Fixes https://github.com/pytorch/pytorch/issues/61096 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61331 Reviewed By: eellison Differential Revision: D29578910 Pulled By: gmagogsfm fbshipit-source-id: 96e58164491a077ddc7a1d6aa6ccef8c0c9efda2 --- test/test_linalg.py | 4 ++++ torch/functional.py | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py index c005d2f202fd7..779d13264ce5b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -7916,6 +7916,10 @@ def test_tensordot(self, device): cn = torch.from_numpy(np.tensordot(a.cpu().numpy(), b.cpu().numpy())) self.assertEqual(c, cn) + a = torch.tensordot(torch.tensor(0.), torch.tensor(0.), 0) + an = torch.from_numpy(np.tensordot(np.zeros((), dtype=np.float32), np.zeros((), dtype=np.float32), 0)) + self.assertEqual(a, an) + instantiate_device_type_tests(TestLinalg, globals()) diff --git a/torch/functional.py b/torch/functional.py index acb32990d99dd..a435f727e96eb 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -926,6 +926,12 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 if has_torch_function_variadic(a, b): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) + if not isinstance(dims, (tuple, list, torch.Tensor, int)): + raise RuntimeError("tensordot expects dims to be int or " + + "Tuple[List[int], List[int]] or " + + "List[List[int]] containing two lists, but got " + + f"dims={dims}") + dims_a: List[int] = [] dims_b: List[int] = [] @@ -951,9 +957,6 @@ def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) - if len(dims_a) == 0 or len(dims_b) == 0: - raise RuntimeError(f"unsupported input to tensordot, got dims={dims}") - if out is None: return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore[attr-defined] else: From 0f6876d721eead82681fc6c33903c3d047117cc7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 10 Jul 2021 17:09:20 -0700 Subject: [PATCH 070/122] [Model Averaging] Create a post-localSGD communication hook (#61206) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61206 Create a communication hook to run post-local SGD. This will be combined with model averager component to better support local SGD. In contrast to the previous approach that runs local gradient averaging + global model averaging at each step for the first K steps, now we plan to run global gradient averaging only for the first K steps at each step, just like normal DDP. This can give us two advantages: 1) For some optimizers, model averaging can cause discrepancy in optimizer states. If we still do global gradient averaging for the first K steps, we can defer such discrepancy until we actually start local SGD. 2) Gradient averaging at the first K steps only run one allreduce that overlaps with backward pass, so it should also be more efficient. 
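For reference, a minimal registration sketch built on the hook added here; it assumes the process group is already initialized and ddp_model is an existing DistributedDataParallel instance:

    import torch.distributed as dist
    import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD

    subgroup, _ = dist.new_subgroups()    # e.g. one subgroup per node
    state = post_localSGD.PostLocalSGDState(
        process_group=None,               # global allreduce for the first K steps
        subgroup=subgroup,                # local allreduce afterwards
        start_localSGD_iter=100,          # K
    )
    ddp_model.register_comm_hook(state, post_localSGD.post_localSGD_hook)

After step 100, gradients are only averaged within each subgroup, and a periodic model averager (see the following commit) is expected to sync parameters across subgroups after the optimizer step.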
Proposal: https://github.com/pytorch/pytorch/issues/59699 ghstack-source-id: 133371322 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_ddp_hook_parity_post_localSGD Reviewed By: pritamdamania87 Differential Revision: D29523292 fbshipit-source-id: 3f215f7150f2917c2781278fad759530c685ea2c --- .../ddp_comm_hooks/post_localSGD_hook.py | 94 +++++++++++++++++++ .../_internal/distributed/distributed_test.py | 21 +++++ 2 files changed, 115 insertions(+) create mode 100644 torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py diff --git a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py new file mode 100644 index 0000000000000..ce43fbdbd2111 --- /dev/null +++ b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py @@ -0,0 +1,94 @@ +import logging + +import torch +import torch.distributed as dist + +from . import default_hooks as default + + +class PostLocalSGDState(object): + r""" + Stores the state for all-reducing gradients globally using ``process_group`` until step ``start_localSGD_iter``, + and all-reducing gradients locally using ``subgroup`` afterwards. + """ + + __slots__ = [ + "process_group", + "subgroup", + "start_localSGD_iter", + "iter", + ] + + def __init__( + self, + process_group, + subgroup, + start_localSGD_iter, + ): + logging.info( + "Local SGD will be started after {} iterations".format(start_localSGD_iter) + ) + + # The group used for all-reducing gradients globally. + self.process_group = process_group + # The group used for all-reducing gradients locally. + self.subgroup = subgroup + self.start_localSGD_iter = start_localSGD_iter + # Iteration/step in the training loop. + self.iter = 0 + + def maybe_increase_iter(self, bucket): + # Since bucket 0 is the last bucket to allreduce in an iteration. + # Only increase `iter` when bucket 0 is processed. + if bucket.is_the_last_bucket_to_allreduce(): + self.iter += 1 + + if self.iter == self.start_localSGD_iter: + logging.info( + "Start to apply local SGD after {} iterations.".format(self.iter) + ) + + +def post_localSGD_hook( + state: PostLocalSGDState, bucket: dist.GradBucket +) -> torch.futures.Future: + """ + This DDP communication hook is used for running post-localSGD algorithm, + by combining with a model averaging component (e.g., + :class:`~torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager`) + that runs after the optimizer step. + + Args: + state (PostLocalSGDState): State information to run post-localSGD. + Users mainly need to tune ``start_localSGD_iter`` to determine when to start local SGD. + bucket (dist.GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. + Note that since DDP comm hook only supports single process single device mode at this time, + only exactly one tensor is stored in this bucket. + + Returns: + Future handler of the communication, which updates the gradients in place. + + Example:: + >>> state = PostLocalSGDState(process_group=process_group, subgroup=subgroup, + start_localSGD_iter=10) + >>> ddp_model.register_comm_hook(state, post_localSGD_hook) + >>> # Also need to establish a model averaging module and run model averaging after ``optimizer.step()``. + >>> # Please refer to the examples in ``torch.distributed.algorithms.model_averaging.averagers`` module. 
+ """ + global_group_to_use = ( + state.process_group if state.process_group is not None else dist.group.WORLD + ) + world_size = global_group_to_use.size() + + # The input tensor is a flattened 1D tensor. + input_tensor = bucket.get_tensor() + + # Run allreduce using `global_group_to_use` in the first `start_localSGD_iter` iterations. + if state.iter < state.start_localSGD_iter: + state.maybe_increase_iter(bucket) + return default._allreduce_fut(global_group_to_use, input_tensor) + + # Run allreduce using `subgroup` after the first `start_localSGD_iter` iterations. + # From this moment, model averaging should run after the optimizer step, + # to globally allreduce all the parameters. + return default._allreduce_fut(state.subgroup, input_tensor) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index d5b06bb5530e1..b2529bdbfc474 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -17,6 +17,7 @@ import torch.cuda import torch.distributed as dist import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD +import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD import torch.distributed.algorithms.model_averaging.averagers as averagers import torch.distributed.algorithms.model_averaging.utils as model_averaging_utils import torch.nn as nn @@ -59,6 +60,7 @@ IS_WINDOWS, FILE_SCHEMA, IS_FBCODE, + NO_MULTIPROCESSING_SPAWN, ) from torch.utils.data.distributed import DistributedSampler @@ -3895,6 +3897,25 @@ def test_ddp_hook_parity_powerSGD(self): state=powersgd_state, hook=powerSGD.powerSGD_hook ) + @unittest.skipIf( + BACKEND != "nccl" and BACKEND != "gloo", + "MPI backend does not support DDP communication hook on CUDA devices", + ) + @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ + don't support multiprocessing with spawn start method") + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + @skip_if_rocm + def test_ddp_hook_parity_post_localSGD(self): + # Although we start run local SGD at iteration 10, since we still use the global process group to run it, + # the post-LocalSGD actually still allreduces gradients globally for the remaining iterations. + state = post_localSGD.PostLocalSGDState(process_group=None, subgroup=dist.group.WORLD, start_localSGD_iter=10) + self._test_ddp_hook_parity(state=state, hook=post_localSGD.post_localSGD_hook) + + # Since we start local SGD later than the total number of 100 iterations, + # no local SGD actually is executed, and we don't even need to provide a subgroup for this case. + state = post_localSGD.PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=1000) + self._test_ddp_hook_parity(state=state, hook=post_localSGD.post_localSGD_hook) + def _prepare_single_device_module( self, rank, From df00c636d2582f7f7cc2fb885b69528734cdafb5 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 10 Jul 2021 17:09:20 -0700 Subject: [PATCH 071/122] [Model Averaging] Skip model averaging for the first K steps (#61207) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61207 Model averager now must be combined with post-localSGD DDP communication hook. It will skip model averaging for the first K steps, because post-localSGD communication hook will run global gradient averaging during this phase. 
Proposal: https://github.com/pytorch/pytorch/issues/59699 ghstack-source-id: 133371335 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:distributed_nccl_fork -- test_periodic_model_averager Reviewed By: pritamdamania87 Differential Revision: D29523738 fbshipit-source-id: 3fa9611046e1c0afa4bda78aa3ba200fa2a5fa4b --- .../algorithms/model_averaging/averagers.py | 24 +++++++++++-------- .../_internal/distributed/distributed_test.py | 22 +++++++++-------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/torch/distributed/algorithms/model_averaging/averagers.py b/torch/distributed/algorithms/model_averaging/averagers.py index f93dea37b46f3..0cb5be6d40392 100644 --- a/torch/distributed/algorithms/model_averaging/averagers.py +++ b/torch/distributed/algorithms/model_averaging/averagers.py @@ -6,9 +6,9 @@ class PeriodicModelAverager: r""" - Averages parameters periodically or during the warm-up stage. + Averages parameters periodically after the warm-up stage. - This can be used for running `post-local SDG `_, + This can be used for running `post-local SGD `_, by running :class:`~torch.nn.DistributedDataParallel` (DDP) using the subgroups created by :meth:`~torch.distributed.new_subgroups`. @@ -18,7 +18,7 @@ class PeriodicModelAverager: Usually the period should be greater than ``1`` to reduce the communication cost. Otherwise, only DDP needs to be used. warmup_steps (int): The number of warm-up steps. During this stage, - ``period`` is viewed as 1, and the parameters are averaged at every step. + model averaging is skipped. process_group: The process group to be used for all-reduce. If ``None``, the default process group, which is created by :func:`torch.distributed.init_process_group`, @@ -28,20 +28,24 @@ class PeriodicModelAverager: >>> import torch >>> import torch.distributed as dist + >>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD >>> import torch.distributed.algorithms.model_averaging.averagers as averagers >>> import torch.nn as nn >>> >>> dist.init_process_group("nccl", rank=rank, world_size=16) >>> torch.cuda.set_device(rank) >>> module = nn.Linear(1, 1, bias=False).to(rank) - >>> subgroup, subgroups = dist.new_subgroups() - >>> # Gradients are averaged by each intra-node subgroup during the backward pass. >>> model = nn.parallel.DistributedDataParallel( - >>> module, device_ids=[rank], output_device=rank, process_group=subgroup + >>> module, device_ids=[rank], output_device=rank >>> ) + >>> # Register a post-localSGD communication hook. + >>> subgroup, subgroups = dist.new_subgroups() + >>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100) + >>> model.register_comm_hook(state, post_localSGD_hook) >>> - >>> # In the first 100 steps, run model averaging every step. + >>> # In the first 100 steps, run global gradient averaging like normal DDP at every step. >>> # After 100 steps, run model averaging every 4 steps. + >>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``. 
>>> averager = averagers.PeriodicModelAverager(model, warmup_steps=100, period=4) >>> for step in range(0, 20): >>> optimizer.zero_grad() @@ -84,10 +88,10 @@ def __init__( def average_parameters(self): r""" - Averages parameters if ``step`` is less than ``warmup_steps``, - or it can be divided by ``period``, where ``step`` is increased by 1 + Averages parameters if ``step`` is no less than ``warmup_steps`` + and it can be divided by ``period``, where ``step`` is increased by 1 at each iteration in the training loop. """ - if self.step < self.warmup_steps or self.step % self.period == 0: + if self.step >= self.warmup_steps and (self.step - self.warmup_steps) % self.period == 0: utils.average_parameters(self.module, self.process_group) self.step += 1 diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index b2529bdbfc474..6b3a50ba354f7 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1010,16 +1010,18 @@ def test_periodic_model_averager(self): param = next(model.parameters()) tensor = torch.ones_like(param.data) * rank expected_avg_tensor = torch.ones_like(param.data) * sum(range(world_size)) / world_size - averager = averagers.PeriodicModelAverager(model, warmup_steps=10, period=4) - for step in range(0, 20): - # Reset the parameters at every step. - param.data = copy.deepcopy(tensor) - averager.average_parameters() - if step < 10 or step % 4 == 0: - self.assertEqual(param.data, expected_avg_tensor) - else: - # No model averaging, so the parameters are not updated. - self.assertEqual(param.data, tensor) + period = 4 + for warmup_steps in [12, 13, 14, 15]: + averager = averagers.PeriodicModelAverager(model, warmup_steps=warmup_steps, period=period) + for step in range(0, 20): + # Reset the parameters at every step. + param.data = copy.deepcopy(tensor) + averager.average_parameters() + if step >= warmup_steps and (step - warmup_steps) % period == 0: + self.assertEqual(param.data, expected_avg_tensor) + else: + # No model averaging, so the parameters are not updated. + self.assertEqual(param.data, tensor) # NCCL Batch SEND RECV @skip_if_no_gpu From 5a17cb6f4415cfc57dabbf408e42b7675c66a64a Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Sat, 10 Jul 2021 17:59:50 -0700 Subject: [PATCH 072/122] Add channels-last support for bilinear and nearest 2d interpolation on CUDA (#56322) Summary: Add channels-last support for bilinear and nearest 2d interpolation on CUDA Benchmark (on 2070 Super) is available at - nearest 2d: https://github.com/xwang233/code-snippet/tree/master/interpolate-channels-last/nearest-2d - bilinear: https://github.com/xwang233/code-snippet/tree/master/interpolate-channels-last/bilinear Some regressions are seen for tensors with small channel size. We may add a heuristic to dispatch the contiguous and channels-last path if needed. 
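For example (a rough sketch; it needs a CUDA device, and the channel count is chosen to clear the dispatch heuristics added below):

    import torch
    import torch.nn.functional as F

    x = torch.randn(8, 32, 64, 64, device="cuda").to(memory_format=torch.channels_last)
    # both modes now have dedicated NHWC CUDA kernels rather than always
    # going through the contiguous path
    y_nearest = F.interpolate(x, scale_factor=2, mode="nearest")
    y_bilinear = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False)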
Close https://github.com/pytorch/pytorch/issues/60137 Pull Request resolved: https://github.com/pytorch/pytorch/pull/56322 Reviewed By: mruberry Differential Revision: D29645980 Pulled By: ngimel fbshipit-source-id: c36dff4ee4789bec9b01da4029f326d30067c6b7 --- aten/src/ATen/native/UpSampleBilinear2d.cpp | 2 +- aten/src/ATen/native/UpSampleNearest2d.cpp | 2 +- aten/src/ATen/native/cuda/KernelUtils.cuh | 18 + .../ATen/native/cuda/UpSampleBilinear2d.cu | 349 +++++++++++++---- .../src/ATen/native/cuda/UpSampleNearest2d.cu | 356 +++++++++++++----- test/test_nn.py | 168 +++++---- 6 files changed, 646 insertions(+), 249 deletions(-) diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 7398766a99fd3..bd57b67592657 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -44,7 +44,7 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( " but got grad_output.size(", i, ") = ", grad_output.size(i)); } - set_output(input_size, grad_output.options()); + set_output(input_size, grad_output.options().memory_format(grad_output.suggest_memory_format())); } } // namespace meta diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index a011ca3824388..47914eb9a2fa7 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -41,7 +41,7 @@ TORCH_META_FUNC(upsample_nearest2d_backward) ( " but got grad_output.size(", i, ") = ", grad_output.size(i)); } - set_output(input_size, grad_output.options()); + set_output(input_size, grad_output.options().memory_format(grad_output.suggest_memory_format())); } } // namespace meta diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index a76a0194910a3..f600ad4bbe7a8 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -5,6 +5,24 @@ namespace at { namespace native { +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +// for channels-last +__device__ __forceinline__ size_t +idx_cl( + const size_t n, const size_t h, const size_t w, const size_t c, + const size_t height, const size_t width, const size_t channel +) { + return ((n * height + h) * width + w) * channel + c; +} + template < typename scalar_t, typename index_t, diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index b04456d242f58..24df72f9a8f77 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -9,20 +9,12 @@ #include #include #include +#include namespace at { namespace native { namespace { -__device__ __forceinline__ size_t -idx(const size_t nc, - const size_t height, - const size_t width, - const size_t y, - const size_t x) { - return (nc * height + y) * width + x; -} - template C10_LAUNCH_BOUNDS_1(1024) __global__ void upsample_bilinear2d_out_frame( @@ -44,19 +36,7 @@ __global__ void upsample_bilinear2d_out_frame( if (index < n) { const int w2 = index % width2; // 0:width2-1 const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const scalar_t val = idata[n][c][h1][w1]; - 
odata[n][c][h2][w2] = val; - } - } - return; - } - // + const accscalar_t h1r = area_pixel_compute_source_index( rheight, h2, align_corners, /*cubic=*/false); const int h1 = h1r; @@ -85,6 +65,55 @@ __global__ void upsample_bilinear2d_out_frame( } } +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_nhwc_out_frame( + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const int batchsize, + const int channels, + const int height1, + const int width1, + const int height2, + const int width2, + const scalar_t* idata, + scalar_t* odata, + const int out_numel) { + + const int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < out_numel) { + const int c = index % channels; + const int w2 = (index / channels) % width2; + const int h2 = (index / channels / width2) % height2; + const int n = index / channels / width2 / height2; + + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + + const accscalar_t val = h0lambda * ( + w0lambda * idata[idx_cl(n, h1, w1, c, height1, width1, channels)] + + w1lambda * idata[idx_cl(n, h1, w1 + w1p, c, height1, width1, channels)] + ) + h1lambda * ( + w0lambda * idata[idx_cl(n, h1 + h1p, w1, c, height1, width1, channels)] + + w1lambda * idata[idx_cl(n, h1 + h1p, w1 + w1p, c, height1, width1, channels)] + ); + odata[idx_cl(n, h2, w2, c, height2, width2, channels)] = static_cast(val); + } +} + // Backward (adjoint) operation 1 <- 2 (accumulates) template C10_LAUNCH_BOUNDS_1(1024) @@ -151,6 +180,73 @@ __global__ void upsample_bilinear2d_backward_out_frame( } } +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_backward_nhwc_out_frame( + const size_t nc, + const int height1, + const int width1, + const int height2, + const int width2, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + scalar_t* __restrict__ idata, + const scalar_t* __restrict__ odata, + const int channels, + const size_t o_numel, + const size_t i_numel) { + + const int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < o_numel) { + const int c = index % channels; + const int w2 = (index / channels) % width2; + const int h2 = (index / channels / width2) % height2; + const int n = index / channels / width2 / height2; + + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + + const scalar_t d2val = odata[index]; + fastAtomicAdd( + idata, + idx_cl(n, h1, w1, c, height1, width1, channels), + i_numel, + static_cast(h0lambda * w0lambda * d2val), + true); + fastAtomicAdd( + idata, + idx_cl(n, h1, w1 + w1p, c, height1, width1, channels), + i_numel, + static_cast(h0lambda * w1lambda * d2val), + true); + fastAtomicAdd( + idata, + idx_cl(n, h1 + h1p, w1, c, height1, width1, channels), + i_numel, + static_cast(h1lambda * w0lambda * d2val), + true); + fastAtomicAdd( + idata, + idx_cl(n, h1 + h1p, w1 + w1p, c, height1, width1, channels), + i_numel, + static_cast(h1lambda * w1lambda * d2val), + true); + } +} + static void upsample_bilinear2d_out_cuda_template( const Tensor& output, const Tensor& input, @@ -169,31 +265,84 @@ static void upsample_bilinear2d_out_cuda_template( int input_height = input.size(2); int input_width = input.size(3); - const int num_kernels = output_height * output_width; - const int num_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const auto memory_format = input.suggest_memory_format(); + + if (input.sizes() == output.sizes()) { + output.copy_(input); + return; + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { + // heuristic: only use channels_last path when it's faster than the contiguous path + if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 16 && \ + output.is_contiguous(memory_format)) { + using accscalar_t = at::acc_type; + + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "upsample_bilinear2d_nhwc only supports input tensors with less than INT_MAX elements"); + TORCH_CHECK(output.numel() < std::numeric_limits::max(), + "upsample_bilinear2d_nhwc only supports output tensors with less than INT_MAX elements"); + + const int batchsize = input.size(0); + const int channels = input.size(1); + const int height1 = input.size(2); + const int width1 = input.size(3); + const int height2 = output.size(2); + const int width2 = output.size(3); + + // const int num_kernels = output_height * output_width; + const int num_kernels = output.numel(); + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + + at::Tensor input_cl = input.contiguous(at::MemoryFormat::ChannelsLast); + + const scalar_t* idata = input_cl.data_ptr(); + scalar_t* odata = output.data_ptr(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { - using accscalar_t = at::acc_type; - - auto idata = input.packed_accessor64(); - auto odata = output.packed_accessor64(); - - const accscalar_t rheight = area_pixel_compute_scale( - input_height, output_height, align_corners, scales_h); - const accscalar_t rwidth = area_pixel_compute_scale( - input_width, output_width, align_corners, scales_w); - - upsample_bilinear2d_out_frame - <<>>( - num_kernels, rheight, rwidth, align_corners, idata, odata); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + upsample_bilinear2d_nhwc_out_frame + <<>>( + rheight, rwidth, align_corners, + batchsize, + channels, + height1, + width1, + height2, + width2, + idata, 
odata, + output.numel()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + // non-channels_last case, not necessarily contiguous + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor64(); + auto odata = output.packed_accessor64(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + upsample_bilinear2d_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + }); } static void upsample_bilinear2d_backward_out_cuda_template( @@ -216,14 +365,12 @@ static void upsample_bilinear2d_backward_out_cuda_template( int input_height = input_size[2]; int input_width = input_size[3]; - Tensor grad_output = grad_output_.contiguous(); - if (grad_input.numel() == 0) { return; } - // A contiguous tensor is required for the kernel launch config - grad_input.contiguous(); + const auto memory_format = grad_output_.suggest_memory_format(); + // initialization to zero is required here. As we launch one thread per output // element, and atomicAdd to input gradient. Given a sparse sampling case, our // threads are not covering the whole input tensor. @@ -234,35 +381,79 @@ static void upsample_bilinear2d_backward_out_cuda_template( at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad_output.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { - using accscalar_t = at::acc_type; - - auto idata = grad_input.data_ptr(); - auto odata = grad_output.data_ptr(); - - const accscalar_t rheight = area_pixel_compute_scale( - input_height, output_height, align_corners, scales_h); - const accscalar_t rwidth = area_pixel_compute_scale( - input_width, output_width, align_corners, scales_w); - - upsample_bilinear2d_backward_out_frame - <<(num_threads)), - num_threads, - 0, - stream>>>( - nbatch * channels, - input_height, - input_width, - output_height, - output_width, - rheight, - rwidth, - align_corners, - idata, - odata); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + if (grad_output_.sizes() == grad_input.sizes()) { + grad_input.copy_(grad_output_); + return; + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { + if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 4 && \ + grad_input.is_contiguous(memory_format)) { + using accscalar_t = at::acc_type; + + Tensor grad_output = grad_output_.contiguous(at::MemoryFormat::ChannelsLast); + + auto idata = grad_input.data_ptr(); + auto odata = grad_output.data_ptr(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + upsample_bilinear2d_backward_nhwc_out_frame + <<(num_threads)), num_threads, 0, stream>>>( + nbatch * channels, + input_height, + input_width, + output_height, + output_width, + rheight, + rwidth, + align_corners, + idata, + odata, + channels, + grad_output.numel(), + grad_input.numel()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else 
{ + using accscalar_t = at::acc_type; + + // This is needed for non-contiguous tensors. + Tensor grad_input_c = grad_input.is_contiguous() ? grad_input : at::zeros(grad_input.sizes(), grad_input.options()); + Tensor grad_output = grad_output_.contiguous(); + + auto idata = grad_input_c.data_ptr(); + auto odata = grad_output.data_ptr(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + upsample_bilinear2d_backward_out_frame + <<(num_threads)), + num_threads, + 0, + stream>>>( + nbatch * channels, + input_height, + input_width, + output_height, + output_width, + rheight, + rwidth, + align_corners, + idata, + odata); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (!grad_input.is_contiguous()) { + grad_input.copy_(grad_input_c); + } + } + }); } } // namespace diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index b893cf8d1a529..4a4edf6b3290d 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -58,6 +60,35 @@ __global__ void upsample_nearest2d_out_frame( } } +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_nhwc_out_frame( + const scalar_t* idata, + scalar_t* odata, + const size_t channels, + const size_t height1, + const size_t width1, + const size_t height2, + const size_t width2, + float height_scale, + float width_scale, + const size_t out_numel) { + + const int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < out_numel) { + const int c = index % channels; + const int w2 = (index / channels) % width2; + const int h2 = (index / channels / width2) % height2; + const int n = index / channels / width2 / height2; + + const size_t h1 = height1 == height2 ? h2 : nearest_neighbor_compute_source_index(height_scale, h2, height1); + const size_t w1 = width1 == width2 ? 
w2 : nearest_neighbor_compute_source_index(width_scale, w2, width1); + + odata[index] = idata[idx_cl(n, h1, w1, c, height1, width1, channels)]; + } +} + // see NOTE [ Nearest neighbor upsampling kernel implementation ] template C10_LAUNCH_BOUNDS_1(1024) @@ -111,6 +142,47 @@ __global__ void upsample_nearest2d_backward_out_frame( } } +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_backward_nhwc_out_frame( + const scalar_t* go, + scalar_t* gi, + const size_t height1, + const size_t width1, + const size_t height2, + const size_t width2, + const size_t channels, + const float height_scale, + const float width_scale, + const size_t gi_numel) { + + // 1 is for grad_output (src) + // 2 is for grad_input (dst) + + const int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < gi_numel) { + const int c = index % channels; + const int w2 = (index / channels) % width2; + const int h2 = (index / channels / width2) % height2; + const int n = index / channels / width2 / height2; + + int h1 = nearest_neighbor_bw_compute_source_index(height_scale, h2, height1); + int h1_up = nearest_neighbor_bw_compute_source_index(height_scale, h2 + 1, height1); + + int w1 = nearest_neighbor_bw_compute_source_index(width_scale, w2, width1); + int w1_up = nearest_neighbor_bw_compute_source_index(width_scale, w2 + 1, width1); + + accscalar_t grad = 0; + for (int ih = h1; ih < h1_up; ih++) { + for (int iw = w1; iw < w1_up; iw++) { + grad += go[idx_cl(n, ih, iw, c, height1, width1, channels)]; + } + } + gi[index] = static_cast(grad); + } +} + static void upsample_nearest2d_out_cuda_template( const Tensor& output, const Tensor& input_, @@ -120,10 +192,9 @@ static void upsample_nearest2d_out_cuda_template( TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); - // TODO: remove this when the cuda kernel is updated to support the channels_last memory format. - // This is a temporary hack to prevent a silence correctness issue when calling this kernel - // with tensors in channels_last format. - auto output_c = output.is_contiguous() ? 
output : at::empty(output.sizes(), output.options()); + if (input_.numel() == 0) { + return; + } int output_height = output_size[0]; int output_width = output_size[1]; @@ -133,70 +204,110 @@ static void upsample_nearest2d_out_cuda_template( int input_height = input_.size(2); int input_width = input_.size(3); - Tensor input = input_.contiguous(); + const float height_scale = compute_scales_value(scales_h, input_height, output_height); + const float width_scale = compute_scales_value(scales_w, input_width, output_width); - if (input.numel() == 0) { + const auto memory_format = input_.suggest_memory_format(); + + if (input_.sizes() == output.sizes()) { + output.copy_(input_); return; } - int nc = nbatch * channels; - - const int max_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS); - - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int* maxGridSize = at::cuda::getCurrentDeviceProperties()->maxGridSize; - - // upsample_nearest2d meta call makes sure input/output tensor is not empty; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(output_width), max_threads)); - int block_y = std::min( - maxThreadsDim[1], - std::min(lastPow2(output_height), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(nc, max_threads / block_x / block_y)); - const dim3 block(block_x, block_y, block_z); - - int grid_x = cuda::ATenCeilDiv(output_width, block_x); - int grid_y = cuda::ATenCeilDiv(output_height, block_y); - int grid_z = std::min( - maxGridSize[2], cuda::ATenCeilDiv(nc, block_z * 4)); - const dim3 grid(grid_x, grid_y, grid_z); - // Error out on cases where grid_x & grid_y exceeds limit of launch config, as - // the current kernel implementation doesn't loop over the two dimensions. - // This is unlikely to happen. - // TODO: kernel implementation could stride on spatial dimension. We probably - // need to overhaul the kernel. 
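A minimal host-side sketch of the flat-index convention the *_nhwc_out_frame kernels above rely on: a linear NHWC index is decomposed with channels fastest-moving, and idx_cl maps (n, h, w, c) back to an offset. nhwc_offset below is a hypothetical stand-in for idx_cl, shown only to make the round trip explicit; it is not the ATen helper itself.

#include <cassert>
#include <cstdint>

// Mirrors the offset produced by idx_cl(n, h, w, c, H, W, C) in the kernels above
// (assumed layout: channels innermost, i.e. NHWC / channels_last).
static int64_t nhwc_offset(int n, int h, int w, int c, int H, int W, int C) {
  return ((static_cast<int64_t>(n) * H + h) * W + w) * C + c;
}

int main() {
  const int N = 2, H = 3, W = 4, C = 5;
  for (int64_t index = 0; index < int64_t(N) * H * W * C; ++index) {
    // Same decomposition as in upsample_bilinear2d_nhwc_out_frame /
    // upsample_nearest2d_nhwc_out_frame: c is fastest, then w, h, n.
    const int c = index % C;
    const int w = (index / C) % W;
    const int h = (index / C / W) % H;
    const int n = index / C / W / H;
    assert(nhwc_offset(n, h, w, c, H, W, C) == index);
  }
  return 0;
}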
- TORCH_CHECK( - grid_x <= maxGridSize[0] && grid_y <= maxGridSize[1], - "input tensor has spatial dimension larger than the kernel capacity"); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_out_frame", [&] { - using accscalar_t = at::acc_type; - - auto idata = input.data_ptr(); - auto odata = output_c.data_ptr(); - - const float height_scale = compute_scales_value(scales_h, input_height, output_height); - const float width_scale = compute_scales_value(scales_w, input_width, output_width); - - upsample_nearest2d_out_frame - <<>>( - idata, - odata, - nc, - input_height, - input_width, - output_height, - output_width, - height_scale, - width_scale); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - - if (!output.is_contiguous()) { - output.copy_(output_c); + // heuristic: only use channels_last path when it's faster than the contiguous path + if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 4 && \ + output.is_contiguous(memory_format)) { + at::Tensor input = input_.contiguous(at::MemoryFormat::ChannelsLast); + + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "upsample_nearest_nhwc only supports input tensors with less than INT_MAX elements"); + TORCH_CHECK(output.numel() < std::numeric_limits::max(), + "upsample_nearest_nhwc only supports output tensors with less than INT_MAX elements"); + + const int num_kernels = output.numel(); + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_nhwc_out_frame", [&] { + const scalar_t* idata = input.data_ptr(); + scalar_t* odata = output.data_ptr(); + + upsample_nearest2d_nhwc_out_frame + <<>>( + idata, + odata, + channels, + input_height, + input_width, + output_height, + output_width, + height_scale, + width_scale, + output.numel() + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + else { + // This is needed for non-contiguous tensors. + Tensor output_c = output.is_contiguous() ? output : at::empty(output.sizes(), output.options()); + Tensor input = input_.contiguous(); + + int nc = nbatch * channels; + + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS); + + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int* maxGridSize = at::cuda::getCurrentDeviceProperties()->maxGridSize; + + // upsample_nearest2d meta call makes sure input/output tensor is not empty; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(output_width), max_threads)); + int block_y = std::min( + maxThreadsDim[1], + std::min(lastPow2(output_height), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(nc, max_threads / block_x / block_y)); + const dim3 block(block_x, block_y, block_z); + + int grid_x = cuda::ATenCeilDiv(output_width, block_x); + int grid_y = cuda::ATenCeilDiv(output_height, block_y); + int grid_z = std::min( + maxGridSize[2], cuda::ATenCeilDiv(nc, block_z * 4)); + const dim3 grid(grid_x, grid_y, grid_z); + // Error out on cases where grid_x & grid_y exceeds limit of launch config, as + // the current kernel implementation doesn't loop over the two dimensions. + // This is unlikely to happen. + // TODO: kernel implementation could stride on spatial dimension. We probably + // need to overhaul the kernel. 
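Both the bilinear and nearest paths gate the new kernels on the same heuristic: the input suggests channels_last, the channel count is large enough (16 for the bilinear forward, 4 elsewhere in this patch), and the output is already contiguous in that layout; otherwise the original NCHW code runs. A stand-alone sketch of that dispatch shape, with use_nhwc_path as an assumed helper name rather than anything in ATen:

#include <ATen/ATen.h>

// Returns true when the channels_last kernel is expected to win; min_channels
// corresponds to the thresholds used in this patch (16 or 4 depending on the op).
bool use_nhwc_path(const at::Tensor& input, const at::Tensor& output,
                   int64_t min_channels) {
  const auto memory_format = input.suggest_memory_format();
  return memory_format == at::MemoryFormat::ChannelsLast &&
         input.size(1) >= min_channels &&
         output.is_contiguous(memory_format);
}

void upsample_dispatch(const at::Tensor& input, const at::Tensor& output) {
  if (use_nhwc_path(input, output, /*min_channels=*/4)) {
    // launch the *_nhwc_out_frame kernel on input.contiguous(at::MemoryFormat::ChannelsLast)
  } else {
    // fall back to the original kernel on input.contiguous()
  }
}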
+ TORCH_CHECK( + grid_x <= maxGridSize[0] && grid_y <= maxGridSize[1], + "input tensor has spatial dimension larger than the kernel capacity"); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.data_ptr(); + auto odata = output_c.data_ptr(); + + upsample_nearest2d_out_frame + <<>>( + idata, + odata, + nc, + input_height, + input_width, + output_height, + output_width, + height_scale, + width_scale); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + if (!output.is_contiguous()) { + output.copy_(output_c); + } } } @@ -211,6 +322,10 @@ static void upsample_nearest2d_backward_out_cuda_template( grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); + if (grad_input.numel() == 0) { + return; + } + int output_height = output_size[0]; int output_width = output_size[1]; @@ -219,44 +334,89 @@ static void upsample_nearest2d_backward_out_cuda_template( int input_height = input_size[2]; int input_width = input_size[3]; - Tensor grad_output = grad_output_.contiguous(); + const float height_scale = compute_scales_value_backwards(scales_h, output_height, input_height); + const float width_scale = compute_scales_value_backwards(scales_w, output_width, input_width); - if (grad_input.numel() == 0) { + auto memory_format = grad_output_.suggest_memory_format(); + + if (grad_output_.sizes() == grad_input.sizes()) { + grad_input.copy_(grad_output_); return; } - // upsample_nearest2d meta call makes sure `nbatch != 0` - unsigned int n = grad_input.numel() / nbatch; - dim3 bdim{std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS)}; - dim3 gdim{cuda::ATenCeilDiv(n, bdim.x)}; - // safe check for int32 indexing; implicitly restrict launch config for kernel - TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { - using accscalar_t = at::acc_type; - - auto idata = grad_input.data_ptr(); - auto odata = grad_output.data_ptr(); - - const float height_scale = compute_scales_value_backwards(scales_h, output_height, input_height); - const float width_scale = compute_scales_value_backwards(scales_w, output_width, input_width); - - upsample_nearest2d_backward_out_frame - <<>>( - odata, - nbatch, - channels, - output_height, - output_width, - input_height, - input_width, - idata, - height_scale, - width_scale); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 4 && \ + grad_input.is_contiguous(memory_format)) { + Tensor grad_output = grad_output_.contiguous(at::MemoryFormat::ChannelsLast); + + TORCH_CHECK(grad_input.numel() < std::numeric_limits::max(), + "upsample_nearest_nhwc only supports grad_input tensors with less than INT_MAX elements"); + TORCH_CHECK(grad_output.numel() < std::numeric_limits::max(), + "upsample_nearest_nhwc only supports grad_output tensors with less than INT_MAX elements"); + + const int num_kernels = grad_input.numel(); + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), 
"upsample_nearest2d_backward_nhwc_out_frame", [&] { + using accscalar_t = at::acc_type; + + const scalar_t* go = grad_output.data_ptr(); + scalar_t* gi = grad_input.data_ptr(); + + upsample_nearest2d_backward_nhwc_out_frame + <<>>( + go, + gi, + output_height, + output_width, + input_height, + input_width, + channels, + height_scale, + width_scale, + grad_input.numel() + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + // This is needed for non-contiguous tensors. + Tensor grad_input_c = grad_input.is_contiguous() ? grad_input : at::empty(grad_input.sizes(), grad_input.options()); + Tensor grad_output = grad_output_.contiguous(); + + // upsample_nearest2d meta call makes sure `nbatch != 0` + unsigned int n = grad_input.numel() / nbatch; + dim3 bdim{std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS)}; + dim3 gdim{cuda::ATenCeilDiv(n, bdim.x)}; + // safe check for int32 indexing; implicitly restrict launch config for kernel + TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input_c.data_ptr(); + auto odata = grad_output.data_ptr(); + + + upsample_nearest2d_backward_out_frame + <<>>( + odata, + nbatch, + channels, + output_height, + output_width, + input_height, + input_width, + idata, + height_scale, + width_scale); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + if (!grad_input.is_contiguous()) { + grad_input.copy_(grad_input_c); + } + } } } // namespace diff --git a/test/test_nn.py b/test/test_nn.py index 29f67b2d948da..398ef70c5613c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10142,76 +10142,6 @@ def test_upsamplingLinear1d_spatial_invariance(self): out_t_5 = m(in_t_9[:, :, :5]) self.assertEqual(out_t_9[:, :, :15], out_t_5) - def test_upsamplingNearest2d(self): - for memory_format in [torch.contiguous_format, torch.channels_last]: - m = nn.Upsample(size=4, mode='nearest') - in_t = torch.ones(1, 2, 2, 2).contiguous(memory_format=memory_format) - in_uint8_t = torch.ones(1, 2, 2, 2, dtype=torch.uint8).contiguous(memory_format=memory_format) - with warnings.catch_warnings(record=True) as w: - out_t = m(in_t) - out_uint8_t = m(in_uint8_t) - self.assertEqual(torch.ones(1, 2, 4, 4), out_t) - self.assertEqual(torch.ones(1, 2, 4, 4, dtype=torch.uint8), out_uint8_t) - # Assert that memory format is carried through to the output - self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) - - # test forward when input's height is not same as width - m = nn.Upsample(size=(4, 2), mode='nearest') - in_t = torch.ones(1, 2, 2, 1).contiguous(memory_format=memory_format) - with warnings.catch_warnings(record=True) as w: - out_t = m(in_t) - self.assertEqual(torch.ones(1, 2, 4, 2), out_t) - self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) - - # test backward when input's height is not same as width - input = torch.ones(1, 2, 2, 1, requires_grad=True).contiguous(memory_format=memory_format) - gradcheck(lambda x: F.interpolate(x, size=(4, 2), mode='nearest'), [input]) - gradgradcheck(lambda x: F.interpolate(x, size=(4, 2), mode='nearest'), [input]) - - input = torch.randn(1, 2, 2, 2, requires_grad=True).contiguous(memory_format=memory_format) - self.assertEqual( - F.interpolate(input, 4, mode='nearest'), - F.interpolate(input, scale_factor=2, mode='nearest')) - 
gradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [input]) - gradgradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [input]) - - # Assert that cpu and cuda handle channels_last memory format in the same way - # https://github.com/pytorch/pytorch/issues/54590 - if torch.cuda.is_available(): - a = torch.ones(2, 2, 3, 4, requires_grad=True).contiguous(memory_format=torch.channels_last) - # make the data asymmetric; ensure that cuda/cpu handle channels_last appropriately. - a[1][1][2][2] = a[1][1][2][3] = 0 - - out_cpu = torch.nn.functional.interpolate(a, scale_factor=2, mode='nearest') - out_cuda = torch.nn.functional.interpolate(a.to('cuda'), scale_factor=2, mode='nearest') - self.assertEqual(out_cpu, out_cuda.to('cpu')) - - gradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [a]) - gradgradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [a]) - - gradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [a.to('cuda')]) - gradgradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [a.to('cuda')]) - - def test_upsamplingBilinear2d(self): - for align_corners in [True, False]: - kwargs = dict(mode='bilinear', align_corners=align_corners) - - for memory_format in [torch.contiguous_format, torch.channels_last]: - - # test float scale factor up & downsampling - for scale_factor in [0.5, 1.5, 2]: - m = nn.Upsample(scale_factor=scale_factor, **kwargs) - in_t = torch.ones(1, 2, 2, 2).contiguous(memory_format=memory_format) - out_size = int(math.floor(in_t.shape[-1] * scale_factor)) - with warnings.catch_warnings(record=True) as w: - out_t = m(in_t) - self.assertEqual(torch.ones(1, 2, out_size, out_size), out_t.data) - # Assert that memory format is carried through to the output - self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) - - input = torch.randn(1, 2, 2, 2, requires_grad=True) - gradcheck(lambda x: F.interpolate(x, out_size, **kwargs), [input]) - def test_upsamplingBicubic2d(self): # test output against known input: align_corners=False result must match opencv in_t = torch.arange(8.).view(1, 2, 2, 2) @@ -13910,6 +13840,104 @@ def helper(n, c, h, w, ks): helper(2, 8, 4, 4, ks=2) helper(None, 3, 50, 50, ks=5) + def test_upsamplingNearest2d(self, device): + for memory_format in [torch.contiguous_format, torch.channels_last]: + in_t = torch.ones(1, 2, 2, 2, device=device).contiguous(memory_format=memory_format) + in_uint8_t = torch.ones(1, 2, 2, 2, dtype=torch.uint8, device=device).contiguous(memory_format=memory_format) + with warnings.catch_warnings(record=True) as w: + out_t = F.interpolate(in_t, size=4, mode='nearest') + out_uint8_t = F.interpolate(in_uint8_t, size=4, mode='nearest') + self.assertEqual(torch.ones(1, 2, 4, 4, device=device), out_t) + self.assertEqual(torch.ones(1, 2, 4, 4, dtype=torch.uint8, device=device), out_uint8_t) + # Assert that memory format is carried through to the output + self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) + + # test forward when input's height is not same as width + in_t = torch.ones(1, 2, 2, 1, device=device).contiguous(memory_format=memory_format).requires_grad_() + with warnings.catch_warnings(record=True) as w: + out_t = F.interpolate(in_t, size=(4, 2), mode='nearest') + self.assertEqual(torch.ones(1, 2, 4, 2, device=device), out_t) + self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) + + out_t.backward(torch.randn_like(out_t)) + self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) + + # test backward when input's height is not same as width + input = 
torch.ones(1, 2, 2, 1, requires_grad=True, device=device).contiguous(memory_format=memory_format) + gradcheck(lambda x: F.interpolate(x, size=(4, 2), mode='nearest'), [input]) + gradgradcheck(lambda x: F.interpolate(x, size=(4, 2), mode='nearest'), [input]) + + input = torch.randn(1, 2, 2, 2, requires_grad=True, device=device).contiguous(memory_format=memory_format) + self.assertEqual( + F.interpolate(input, 4, mode='nearest'), + F.interpolate(input, scale_factor=2, mode='nearest')) + gradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [input]) + gradgradcheck(lambda x: F.interpolate(x, 4, mode='nearest'), [input]) + + # Assert that cpu and cuda handle channels_last memory format in the same way + # https://github.com/pytorch/pytorch/issues/54590 + if torch.device(device).type == 'cuda': + for shapes, scale_factor in product([ + (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) + ], [0.5, 1.5, 2]): + a_cuda = torch.randn(*shapes, device=device).contiguous(memory_format=memory_format).requires_grad_() + a_cpu = a_cuda.detach().cpu().requires_grad_() + + with warnings.catch_warnings(record=True): + out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, mode='nearest') + out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, mode='nearest') + + self.assertEqual(out_cpu.cuda(), out_cuda) + + g_cuda = torch.randn_like(out_cuda) + g_cpu = g_cuda.cpu() + + out_cuda.backward(g_cuda) + out_cpu.backward(g_cpu) + + self.assertEqual(a_cuda.grad, a_cpu.grad) + + def test_upsamplingBilinear2d(self, device): + for align_corners in [True, False]: + kwargs = dict(mode='bilinear', align_corners=align_corners) + for memory_format in [torch.contiguous_format, torch.channels_last]: + # test float scale factor up & downsampling + for scale_factor in [0.5, 1.5, 2]: + in_t = torch.ones(1, 2, 2, 2, device=device).contiguous(memory_format=memory_format).requires_grad_() + out_size = int(math.floor(in_t.shape[-1] * scale_factor)) + with warnings.catch_warnings(record=True) as w: + out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) + self.assertEqual(torch.ones(1, 2, out_size, out_size, device=device), out_t.data) + # Assert that memory format is carried through to the output + self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) + out_t.backward(torch.randn_like(out_t)) + self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) + + input = torch.randn(1, 2, 2, 2, device=device).contiguous(memory_format=memory_format).requires_grad_() + gradcheck(lambda x: F.interpolate(x, out_size, **kwargs), [input]) + + # Assert that cpu and cuda give same results + if torch.device(device).type == 'cuda': + for shapes in [ + (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) + ]: + a_cuda = torch.randn(*shapes, device=device).contiguous(memory_format=memory_format).requires_grad_() + a_cpu = a_cuda.detach().cpu().requires_grad_() + + with warnings.catch_warnings(record=True): + out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs) + out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs) + + self.assertEqual(out_cpu.cuda(), out_cuda) + + g_cuda = torch.randn_like(out_cuda) + g_cpu = g_cuda.cpu() + + out_cuda.backward(g_cuda) + out_cpu.backward(g_cpu) + + self.assertEqual(a_cuda.grad, a_cpu.grad) + @onlyCPU @dtypes(torch.float, torch.double) def test_adaptive_pooling_max_nhwc(self, device, dtype): From 57676ce1283065c96adf58779778db00a73ef86d Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sat, 10 Jul 2021 18:47:31 -0700 Subject: [PATCH 
073/122] Migrate multi_margin_loss to ATen (CUDA) (#61426) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61426 Closes gh-24600, closes gh-24601 These operators use custom kernels that aren't well suited to `TensorIterator` style, so this is just changing the CPU code and cleaning up the style. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D29648015 Pulled By: ngimel fbshipit-source-id: cadf1890cdc2199d57f4533370e554613efeb54a --- aten/src/ATen/LegacyTHFunctionsCUDA.h | 4 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 178 -------- aten/src/ATen/native/cuda/MultiMarginLoss.cu | 384 ++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 8 +- aten/src/THCUNN/CMakeLists.txt | 1 - aten/src/THCUNN/MultiMarginCriterion.cu | 122 ------ .../THCUNN/generic/MultiMarginCriterion.cu | 289 ------------- 7 files changed, 388 insertions(+), 598 deletions(-) create mode 100644 aten/src/ATen/native/cuda/MultiMarginLoss.cu delete mode 100644 aten/src/THCUNN/MultiMarginCriterion.cu delete mode 100644 aten/src/THCUNN/generic/MultiMarginCriterion.cu diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index f86e594b21caa..043b048e7d883 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -23,10 +23,6 @@ std::tuple _th_gels(const Tensor & self, const Tensor & A); Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper); Tensor _th_potri(const Tensor & self, bool upper); Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src); -Tensor & _thnn_multi_margin_loss_forward_out(const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction, Tensor & output); -Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const optional & weight, int64_t reduction); -Tensor & _thnn_multi_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction, Tensor & grad_input); -Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const optional & weight, int64_t reduction); std::tuple _thnn_multilabel_margin_loss_forward_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & output, Tensor & is_target); std::tuple _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction); Tensor & _thnn_multilabel_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target, Tensor & grad_input); diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 653019c114960..f3acbd4dd5817 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -175,184 +175,6 @@ Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { } return self; } -Tensor & _thnn_multi_margin_loss_forward_out(const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction, Tensor & output) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); - const Tensor& weight = *weight_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_multi_margin_loss_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); - break; - } - default: - AT_ERROR("_thnn_multi_margin_loss_forward_out not supported on CUDAType for ", dispatch_scalar_type); - } - return output; -} -Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); - const Tensor& weight = *weight_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 2, "_thnn_multi_margin_loss_forward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 5, "_thnn_multi_margin_loss_forward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfMultiMarginCriterion_updateOutput(globalContext().getTHCState(), self_, target_, output_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); - break; - } - default: - AT_ERROR("_thnn_multi_margin_loss_forward not supported on CUDAType for ", dispatch_scalar_type); - } - return output; -} -Tensor & _thnn_multi_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction, Tensor & grad_input) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); - const Tensor& weight = *weight_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 7, "_thnn_multi_margin_loss_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - default: - AT_ERROR("_thnn_multi_margin_loss_backward_out not supported on CUDAType for ", dispatch_scalar_type); - } - return grad_input; -} -Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const c10::optional& weight_opt, int64_t reduction) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); - const Tensor& weight = *weight_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - auto grad_input_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? 
weight_ : NULL, margin_); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto target_ = checked_dense_tensor_unwrap(target, "target", 3, "_thnn_multi_margin_loss_backward", false, DeviceType::CUDA, ScalarType::Long); - auto p_ = p.toDouble(); - auto margin_ = margin.toDouble(); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 6, "_thnn_multi_margin_loss_backward", true, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfMultiMarginCriterion_updateGradInput(globalContext().getTHCState(), self_, target_, grad_output_, grad_input_, reduction, p_, weight_ ? weight_ : NULL, margin_); - break; - } - default: - AT_ERROR("_thnn_multi_margin_loss_backward not supported on CUDAType for ", dispatch_scalar_type); - } - return grad_input; -} std::tuple _thnn_multilabel_margin_loss_forward_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & output, Tensor & is_target) { const OptionalDeviceGuard device_guard(device_of(self)); auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu new file mode 100644 index 0000000000000..fcf0a6a2356a3 --- /dev/null +++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu @@ -0,0 +1,384 @@ +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { +constexpr int MULTIMARGIN_THREADS = 128; + +template +__global__ void MultiMarginLoss_forward_kernel( + scalar_t *output, scalar_t *input, int64_t *target, scalar_t *weights, + int nframe, int dim, bool sizeAverage, scalar_t margin) { + using acc_t = at::acc_type; + __shared__ acc_t buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + scalar_t *input_k = input + k*dim; + scalar_t *output_k = output + k; + int target_k = static_cast(target[k]); + scalar_t input_target_k = input_k[target_k]; + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i = i_start; i < i_end; i += i_step) { + scalar_t z = margin - input_target_k + input_k[i]; + if (i == target_k) { + continue; + } + + if (z > 0) { + scalar_t h = (P==1) ? 
z : z*z; + if (weights) { + h *= weights[target_k]; + } + buffer[threadIdx.x] += h; + } + } + __syncthreads(); + + // reduce + if (threadIdx.x == 0) { + acc_t sum = 0; + for (int i=0; i < blockDim.x; i++) + sum += buffer[i]; + + const int denom = sizeAverage ? nframe * dim : dim; + *output_k = static_cast(sum / denom); + } +} + +template +__global__ void MultiMarginLoss_backward_kernel( + scalar_t *gradInput, scalar_t *gradOutput, scalar_t *input, int64_t *target, + scalar_t *weights, int nframe, int dim, bool sizeAverage, scalar_t margin, + bool reduce) { + using acc_t = at::acc_type; + __shared__ acc_t buffer[MULTIMARGIN_THREADS]; + int k = blockIdx.x; + scalar_t *input_k = input + k*dim; + scalar_t *gradInput_k = gradInput + k*dim; + int target_k = static_cast(target[k]); + scalar_t input_target_k = input_k[target_k]; + + scalar_t *gradOutput_k = gradOutput; + if (!reduce) { + gradOutput_k += k; + } + + const int denom = sizeAverage && reduce ? nframe * dim : dim; + const acc_t g = acc_t(1) / static_cast(denom); + + int i_start = threadIdx.x; + int i_end = dim; + int i_step = blockDim.x; + + buffer[threadIdx.x] = 0; + for (int i=i_start; i 0) { + acc_t h = (P == 1) ? g : 2*g*z; + if (weights) { + h *= weights[target_k]; + } + + buffer[threadIdx.x] -= static_cast(h); + gradInput_k[i] = static_cast(h); + } else { + gradInput_k[i] = static_cast(0); + } + } + + __syncthreads(); + + // reduce + if (threadIdx.x == 0) { + acc_t gradInput_target_k = 0; + for (int i=0; i(gradInput_target_k); + } + + for (int i=i_start; i &weights_, int64_t reduction, Tensor& out_) { + auto p = p_.toLong(); + TORCH_CHECK(p == 1 || p == 2, "multi_margin_loss: Invalid p, expected 1 or 2 but got ", p); + multi_margin_loss_shape_check(input_, target_); + + if (reduction == at::Reduction::None) { + resize_output(out_, target_.sizes()); + } else if (input_.dim() == 2) { + resize_output(out_, {input_.sizes()[0]}); + } else { + resize_output(out_, {}); + } + + if (input_.numel() == 0) { + return out_; + } + + auto input = input_.contiguous(); + auto target = target_.contiguous(); + Tensor weights; + if (weights_ && weights_->defined()) { + weights = weights_->contiguous(); + } + auto out = (out_.is_contiguous() ? out_ : + at::empty(out_.sizes(), input.options())); + + const auto stream = c10::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "multi_margin_loss_cuda", [&] { + const scalar_t margin = margin_.to(); + if (input.dim() <= 1) { + int nframe = 1; + TORCH_CHECK(target.dim() <= 1 && target.numel() == nframe, "inconsistent target size"); + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + if (p == 1) { + MultiMarginLoss_forward_kernel<1> <<>>( + out.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + 1, + input.dim() < 1 ? input.numel() : input.sizes()[0], + reduction == at::Reduction::Mean, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (p == 2) { + MultiMarginLoss_forward_kernel<2> <<>>( + out.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + 1, + input.dim() < 1 ? input.numel() : input.sizes()[0], + reduction == at::Reduction::Mean, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } else { + auto in_sizes = input.sizes(); + TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); + int nframe = in_sizes[0]; + // allow zero-dim target for 2D input. 
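For reference, each block of MultiMarginLoss_forward_kernel above computes, for one sample x with label target, the sum over i != target of weight[target] * max(0, margin - x[target] + x[i])^p, divided by dim (or nframe * dim when size-averaging). A scalar sketch of that arithmetic; multi_margin_sample_loss is a hypothetical name and makes no claim to match ATen numerics bit for bit:

#include <cstdio>
#include <vector>

double multi_margin_sample_loss(const std::vector<double>& x, int target,
                                double margin, int p, double weight,
                                int denom /* dim, or nframe*dim when size-averaging */) {
  double sum = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    if (static_cast<int>(i) == target) continue;
    const double z = margin - x[target] + x[i];
    if (z > 0) {
      sum += weight * (p == 1 ? z : z * z);   // same hinge term as the kernel
    }
  }
  return sum / denom;
}

int main() {
  const std::vector<double> x = {0.1, 0.9, 0.2};
  // p = 1, margin = 1, unit weight, per-sample denominator = dim
  std::printf("%f\n",
              multi_margin_sample_loss(x, /*target=*/1, 1.0, 1, 1.0,
                                       static_cast<int>(x.size())));
  return 0;
}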
+ TORCH_CHECK(in_sizes[1] != 0 && target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size"); + dim3 blocks(nframe); + dim3 threads(MULTIMARGIN_THREADS); + + if (reduction == at::Reduction::None) { + if (p == 1) { + MultiMarginLoss_forward_kernel<1> <<>>( + out.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + false, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (p == 2) { + MultiMarginLoss_forward_kernel<2> <<>>( + out.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + false, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } else { + auto tmp_output = at::empty({nframe}, input.options()); + if (p == 1) { + MultiMarginLoss_forward_kernel<1> <<>>( + tmp_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + reduction == Reduction::Mean, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (p == 2) { + MultiMarginLoss_forward_kernel<2> <<>>( + tmp_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + reduction == Reduction::Mean, + margin); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + at::sum_out(out, tmp_output, /*dims=*/IntArrayRef{}); + } + } + }); + + if (!out.is_alias_of(out_)) { + out_.copy_(out); + } + return out_; +} + +Tensor multi_margin_loss_cuda( + const Tensor &input, const Tensor &target, const Scalar &p, const Scalar &margin, + const c10::optional &weights, int64_t reduction) { + auto out = at::empty({}, input.options()); + multi_margin_loss_cuda_out(input, target, p, margin, weights, reduction, out); + return out; +} + +Tensor& multi_margin_loss_cuda_backward_out( + const Tensor &grad_output_,const Tensor &input_, const Tensor &target_, + const Scalar &p_, const Scalar &margin_, const c10::optional &weights_, + int64_t reduction, Tensor &grad_input_) { + auto p = p_.toLong(); + TORCH_CHECK(p == 1 || p == 2, + "multi_margin_loss_backward: Invalid p, expected 1 or 2 but got ", p); + multi_margin_loss_shape_check(input_, target_); + resize_output(grad_input_, input_.sizes()); + + if (input_.numel() == 0) { + return grad_input_; + } + + auto input = input_.contiguous(); + auto grad_input = (grad_input_.is_contiguous() ? grad_input_ : + at::empty(grad_input_.sizes(), input.options())); + auto grad_output = grad_output_.contiguous(); + auto target = target_.contiguous(); + Tensor weights; + if (weights_ && weights_->defined()) { + weights = weights_->contiguous(); + } + + const auto stream = c10::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), + "multi_margin_loss_backward_cuda", [&] { + const scalar_t margin = margin_.to(); + + if (input.dim() <= 1) { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) { + MultiMarginLoss_backward_kernel<1> <<>>( + grad_input.data_ptr(), + grad_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + 1, + input.dim() == 0 ? 1 : input.sizes()[0], + reduction == at::Reduction::Mean, + margin, + reduction != at::Reduction::None); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (p == 2) { + MultiMarginLoss_backward_kernel<2> <<>>( + grad_input.data_ptr(), + grad_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? 
weights.data_ptr() : nullptr, + 1, + input.dim() == 0 ? 1 : input.sizes()[0], + reduction == at::Reduction::Mean, + margin, + reduction != at::Reduction::None); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } else { + auto in_sizes = input.sizes(); + TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); + int nframe = in_sizes[0]; + TORCH_CHECK((in_sizes[1] != 0) && (target.dim() <= 1) && (target.numel() == nframe), + "inconsistent target size"); + dim3 blocks(in_sizes[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) { + MultiMarginLoss_backward_kernel<1> <<>>( + grad_input.data_ptr(), + grad_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + reduction == at::Reduction::Mean, + margin, + reduction != at::Reduction::None); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (p == 2) { + MultiMarginLoss_backward_kernel<2> <<>>( + grad_input.data_ptr(), + grad_output.data_ptr(), + input.data_ptr(), + target.data_ptr(), + weights.defined() ? weights.data_ptr() : nullptr, + nframe, in_sizes[1], + reduction == at::Reduction::Mean, + margin, + reduction != at::Reduction::None); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } + }); + + if (!grad_input.is_alias_of(grad_input_)) { + grad_input_.copy_(grad_input); + } + return grad_input_; +} + +Tensor multi_margin_loss_cuda_backward( + const Tensor &grad_output, const Tensor &input, const Tensor &target, + const Scalar &p, const Scalar &margin, const c10::optional &weights, + int64_t reduction) { + auto grad_input = at::empty({}, input.options()); + multi_margin_loss_cuda_backward_out( + grad_output, input, target, p, margin, weights, reduction, grad_input); + return grad_input; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index e8c5dc9e8f00c..41c68fa24349d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8113,25 +8113,25 @@ python_module: nn dispatch: CPU: multi_margin_loss_cpu_out - CUDA: legacy::cuda::_thnn_multi_margin_loss_forward_out + CUDA: multi_margin_loss_cuda_out - func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn dispatch: CPU: multi_margin_loss_cpu - CUDA: legacy::cuda::_thnn_multi_margin_loss_forward + CUDA: multi_margin_loss_cuda - func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward_out - CUDA: legacy::cuda::_thnn_multi_margin_loss_backward_out + CUDA: multi_margin_loss_cuda_backward_out - func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward - CUDA: legacy::cuda::_thnn_multi_margin_loss_backward + CUDA: multi_margin_loss_cuda_backward - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 495d0129d5bbe..2901aa2ff7e83 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -1,6 +1,5 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/MultiLabelMarginCriterion.cu -${CMAKE_CURRENT_SOURCE_DIR}/MultiMarginCriterion.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialClassNLLCriterion.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu deleted file mode 100644 index 253f937f8d60d..0000000000000 --- a/aten/src/THCUNN/MultiMarginCriterion.cu +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include
-#include -#include -#include - -#define MULTIMARGIN_THREADS 128 - -template -__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) -{ - __shared__ Acctype buffer[MULTIMARGIN_THREADS]; - int k = blockIdx.x; - Dtype *input_k = input + k*dim; - Dtype *output_k = output + k; - int target_k = ((int)target[k]); - Dtype input_target_k = input_k[target_k]; - - int i_start = threadIdx.x; - int i_end = dim; - int i_step = blockDim.x; - - buffer[threadIdx.x] = 0; - for (int i = i_start; i < i_end; i += i_step) - { - Dtype z = margin - input_target_k + input_k[i]; - if (i == target_k) - continue; - - if (z > 0) { - Dtype h = (P==1) ? z : z*z; - if(weights) - h *= weights[target_k]; - buffer[threadIdx.x] += h; - } - } - __syncthreads(); - - // reduce - if (threadIdx.x == 0) - { - Acctype sum = 0; - for (int i=0; i < blockDim.x; i++) - sum += buffer[i]; - - *output_k = ScalarConvert::to(sum/dim); - if(sizeAverage) - *output_k /= nframe; - } -} - -template -__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, - Dtype *gradOutput, - Dtype *input, - THCIndex_t *target, - Dtype *weights, - int nframe, - int dim, - bool sizeAverage, - Dtype margin, - int reduce) -{ - __shared__ Acctype buffer[MULTIMARGIN_THREADS]; - int k = blockIdx.x; - Dtype *input_k = input + k*dim; - Dtype *gradInput_k = gradInput + k*dim; - int target_k = ((int)target[k]); - Dtype input_target_k = input_k[target_k]; - - Dtype *gradOutput_k = gradOutput; - if (!reduce) { - gradOutput_k += k; - } - - Acctype g = (sizeAverage && reduce ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim)); - - int i_start = threadIdx.x; - int i_end = dim; - int i_step = blockDim.x; - - buffer[threadIdx.x] = 0; - for (int i=i_start; i 0) - { - Dtype h = ScalarConvert::to((P == 1) ? 
g : 2*g*z); - if(weights) - h *= weights[target_k]; - buffer[threadIdx.x] -= h; - gradInput_k[i] = h; - } - else - gradInput_k[i] = ScalarConvert::to(0); - } - - __syncthreads(); - - // reduce - if (threadIdx.x == 0) - { - Acctype gradInput_target_k = 0; - for (int i=0; i::to(gradInput_target_k); - } - - for (int i=i_start; i -#include - -#undef MULTIMARGIN_THREADS diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu deleted file mode 100644 index a09148c52c859..0000000000000 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ /dev/null @@ -1,289 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/MultiMarginCriterion.cu" -#else - -static inline void THNN_(MultiMarginCriterion_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *target) { - int64_t nframe; - int64_t ndims = input->dim(); - bool valid_inputs = (ndims == 2 && input->size(1) != 0) || (ndims == 1 && input->size(0) != 0) || ndims == 0; - if (ndims <= 1) { - nframe = 1; - } else { - nframe = input->size(0); - } - - TORCH_CHECK( - valid_inputs, - "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", - input->sizes()); - TORCH_CHECK( - valid_inputs && target->dim() <= 1 && target->numel() == nframe, - "inconsistent target size, got: ", - target->sizes()); -} - -// TODO: improve error messages -void THNN_(MultiMarginCriterion_updateOutput)( - THCState *state, - THCTensor *input, - THCIndexTensor *target, - THCTensor *output, - int64_t reduction, - int p, - THCTensor *weights, - accreal margin_) -{ - THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); - if (input->numel() == 0) { - return; - } - scalar_t margin = ScalarConvert::to(margin_); - THCUNN_assertSameGPU(state, 2, input, target); - input = THCTensor_(newContiguous)(state, input); - if(weights) - weights = THCTensor_(newContiguous)(state, weights); - if (THTensor_nDimensionLegacyNoScalars(input) == 1) - { - int nframe = 1; - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, - "inconsistent target size"); - dim3 blocks(1); - dim3 threads(MULTIMARGIN_THREADS); - if (reduction == at::Reduction::None) { - THCTensor_(resizeAs)(state, output, target); - } else { - THCTensor_(resize0d)(state, output); - } - if (p == 1) - { - cunn_MultiMarginCriterion_updateOutput_kernel<1, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), - reduction == at::Reduction::Mean, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateOutput_kernel<2, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), - reduction == at::Reduction::Mean, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - THCudaCheck(cudaGetLastError()); - } - else if (input->dim() == 2) - { - int nframe = input->size(0); - // allow zero-dim target for 2D input. 
- THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, - "inconsistent target size"); - dim3 blocks(input->size(0)); - dim3 threads(MULTIMARGIN_THREADS); - - if (reduction == at::Reduction::None) - { - THCTensor_(resizeAs)(state, output, target); - if (p == 1) - { - cunn_MultiMarginCriterion_updateOutput_kernel<1, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size(1), - false, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateOutput_kernel<2, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size(1), - false, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - THCudaCheck(cudaGetLastError()); - } - else - { - THCTensor_(resize0d)(state, output); - THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size(0)); // tmp output buffer - if (p == 1) - { - cunn_MultiMarginCriterion_updateOutput_kernel<1, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output_), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size(1), - reduction == at::Reduction::Mean, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateOutput_kernel<2, scalar_t, accreal> <<>>( - THCTensor_(data)(state, output_), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - input->size(0), input->size(1), - reduction == at::Reduction::Mean, - margin - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - THCudaCheck(cudaGetLastError()); - auto t = THTensor_wrap(output_); - auto r = THTensor_wrap(output); - at::native::sum_out(t, at::IntArrayRef(std::vector{}), false, r.scalar_type(), r); - THCTensor_(free)(state, output_); - } - } - else - { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - input->sizes()); - } - - THCTensor_(free)(state, input); - if(weights) - THCTensor_(free)(state, weights); -} - -void THNN_(MultiMarginCriterion_updateGradInput)( - THCState *state, - THCTensor *input, - THCIndexTensor *target, - THCTensor *gradOutput, - THCTensor *gradInput, - int64_t reduction, - int p, - THCTensor *weights, - accreal margin_) -{ - THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); - input = THCTensor_(newContiguous)(state, input); - THCTensor_(resizeAs)(state, gradInput, input); - if (input->numel() == 0) { - THCTensor_(free)(state, input); - return; - } - scalar_t margin = ScalarConvert::to(margin_); - THCUNN_assertSameGPU(state, 3, input, gradInput, target); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - if(weights) - weights = THCTensor_(newContiguous)(state, weights); - - if (THTensor_nDimensionLegacyNoScalars(input) == 1) - { - dim3 blocks(1); - dim3 threads(MULTIMARGIN_THREADS); - - if (p == 1) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<1, scalar_t, accreal> <<>>( - THCTensor_(data)(state, gradInput), - THCTensor_(data)(state, gradOutput), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), - reduction == at::Reduction::Mean, - margin, - reduction != at::Reduction::None - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<2, scalar_t, accreal> <<>>( - THCTensor_(data)(state, gradInput), - THCTensor_(data)(state, gradOutput), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), - reduction == at::Reduction::Mean, - margin, - reduction != at::Reduction::None - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - THCudaCheck(cudaGetLastError()); - } - else if (input->dim() == 2) - { - int nframe = gradInput->size(0); - THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, - "inconsistent target size"); - dim3 blocks(gradInput->size(0)); - dim3 threads(MULTIMARGIN_THREADS); - - if (p == 1) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<1, scalar_t, accreal> <<>>( - THCTensor_(data)(state, gradInput), - THCTensor_(data)(state, gradOutput), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size(1), - reduction == at::Reduction::Mean, - margin, - reduction != at::Reduction::None - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<2, scalar_t, accreal> <<>>( - THCTensor_(data)(state, gradInput), - THCTensor_(data)(state, gradOutput), - THCTensor_(data)(state, input), - THCIndexTensor_(data)(state, target), - weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size(1), - reduction == at::Reduction::Mean, - margin, - reduction != at::Reduction::None - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - THCudaCheck(cudaGetLastError()); - } - else - { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - input->sizes()); - } - - THCTensor_(free)(state, input); - THCTensor_(free)(state, gradOutput); - if(weights) - THCTensor_(free)(state, weights); -} - -#endif From 99848c72696e4f17104e735ef0a3ee580c71eba7 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Sat, 10 Jul 2021 19:40:41 -0700 Subject: [PATCH 074/122] [quant] Add tensor_qparam variant to fake_quantize_per_tensor (#61317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61317 Add an overload to fake_quantize_per_tensor that accepts scale/zero_point as input. The reasons to do this are * required for fused observer + fake_quant operator on GPU where the scale/zero_point will be calculated by the observer on device. 
Passing tensor inputs enables us to directly access the scale/zero-point value in the cuda kernel to avoid extra copies/malloc * enables us to pass in float as scale dtype and int32 as zero_point dtype (which is consistent with what the quantize call actually uses) https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/quantized/affine_quantizer_base.cpp#L52-L53 * overload consistent with `quantizer_per_tensor.tensor_qparams` ghstack-source-id: 133370216 Test Plan: buck test mode/dev-nosan caffe2/test/:quantization -- test_backward_per_tensor_cachemask buck test mode/dev-nosan caffe2/test/:quantization -- test_forward_per_tensor_cachemask Reviewed By: raghuramank100 Differential Revision: D29552727 fbshipit-source-id: cbb9af40fc575ad27a29c646b760d5ee52cc923d --- aten/src/ATen/native/native_functions.yaml | 9 ++++ .../cpu/kernels/QuantizedOpKernels.cpp | 41 +++++++++++++++---- .../quantized/cuda/fake_quantize_core.cu | 34 +++++++++++++++ .../ATen/native/quantized/fake_quant_affine.h | 10 +++++ .../fake_quant_per_tensor_affine.cpp | 32 +++++++++++++++ test/quantization/core/test_workflow_ops.py | 19 +++++---- test/test_namedtuple_return_api.py | 4 +- tools/autograd/derivatives.yaml | 3 ++ torch/quantization/observer.py | 2 +- 9 files changed, 137 insertions(+), 17 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 41c68fa24349d..bb3e027fe0123 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -5223,11 +5223,20 @@ device_check: NoCheck # TensorIterator variants: function +- func: fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + - func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine_cachemask +- func: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams + - func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor variants: function diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 2d2912bce99fd..cf807afb42779 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2059,14 +2059,15 @@ void q_batch_norm_kernel( } -void fake_quantize_tensor_cachemask_kernel( - Tensor& output, - Tensor& mask, - const Tensor& input, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max) { +void _fake_quantize_tensor_helper( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max) { + float inv_scale = 1.0f / sc; auto iter_combined = TensorIteratorConfig() @@ -2089,7 +2090,28 @@ void fake_quantize_tensor_cachemask_kernel( } }); }); + } + +void fake_quantize_tensor_cachemask_kernel( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max) { + 
_fake_quantize_tensor_helper(output, mask, input, sc, z_point, quant_min, quant_max); +} +void fake_quantize_tensor_cachemask_tensor_qparams_kernel( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + int64_t quant_min, + int64_t quant_max) { + _fake_quantize_tensor_helper(output, mask, input, sc.item().toFloat(), z_point.item().toInt(), quant_min, quant_max); } void fake_quantize_learnable_tensor_grad_kernel_cpu( @@ -3084,6 +3106,9 @@ REGISTER_DISPATCH(fake_quant_per_channel_cachemask_stub, &fake_quant_per_channel REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +REGISTER_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_stub, + &fake_quantize_tensor_cachemask_tensor_qparams_kernel); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) REGISTER_DISPATCH(qadaptive_avg_pool2d_nhwc_stub, &qadaptive_avg_pool2d_nhwc_kernel); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index 9ec3ea25d34a2..6fb69f14ba7d3 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -52,6 +52,39 @@ void fake_quantize_tensor_cachemask_kernel_cuda( }); } +void fake_quantize_tensor_cachemask_tensor_qparams_kernel_cuda( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max) { + float* scale_ptr = scale.data_ptr(); + int32_t* zp_ptr = zero_point.data_ptr(); + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(output) + .add_output(mask) + .add_input(input) + .build(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "fake_quantize_tensor_cachemask_kernel_types", [&] { + gpu_kernel_multiple_outputs( + iter, + [=] GPU_LAMBDA (scalar_t input_val) -> thrust::tuple { + float inv_scale = 1.0f / (*scale_ptr); + const auto qval = static_cast(std::nearbyint(input_val * inv_scale) + (*zp_ptr)); + return { + // fake_quantized value + (fminf(quant_max, fmaxf(quant_min, qval)) - (*zp_ptr)) * (*scale_ptr), + // mask for grad + ((quant_min <= qval) && (qval <= quant_max)) + }; + } + ); + }); +} + void _fake_quantize_grad_learnable_tensor_kernel_cuda( TensorIterator& iter, float scale, @@ -80,6 +113,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( } REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel_cuda); +REGISTER_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_stub, &fake_quantize_tensor_cachemask_tensor_qparams_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &_fake_quantize_grad_learnable_tensor_kernel_cuda); // Fake quantize per channel diff --git a/aten/src/ATen/native/quantized/fake_quant_affine.h b/aten/src/ATen/native/quantized/fake_quant_affine.h index 31963831582fe..3e69d58293bc5 100644 --- a/aten/src/ATen/native/quantized/fake_quant_affine.h +++ b/aten/src/ATen/native/quantized/fake_quant_affine.h @@ -18,6 +18,15 @@ using fake_quant_tensor_cachemask_fn = void (*)( int64_t quant_min, int64_t quant_max); +using fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + int64_t quant_min, + int64_t 
quant_max); + using fake_quant_learnable_grad_tensor_fn = void (*)( TensorIterator& iter, float scale, @@ -28,6 +37,7 @@ using fake_quant_learnable_grad_tensor_fn = void (*)( float grad_factor); DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); +DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub); DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); using fake_quant_per_channel_fn = void (*)( diff --git a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp index c42336be9bcfc..cd235e1b86ddf 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp @@ -14,6 +14,8 @@ namespace native { DEFINE_DISPATCH(fake_quant_tensor_cachemask_stub); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(fake_quant_grad_learnable_tensor_stub); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_stub); /* Fake-quantizes the 'inputs' tensor. @@ -40,6 +42,17 @@ Tensor fake_quantize_per_tensor_affine( return std::get<0>(res); } +Tensor fake_quantize_per_tensor_affine( + const Tensor& self, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max) { + const auto res = at::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams( + self, scale, zero_point, quant_min, quant_max); + return std::get<0>(res); +} + /* Fake-quantizes the 'inputs' tensor, saving a mask for the backward pass. This is numerically equivalent to `fake_quantize_per_tensor_affine`, @@ -79,6 +92,25 @@ std::tuple fake_quantize_per_tensor_affine_cachemask( return std::make_tuple(Y, mask); } +std::tuple _fake_quantize_per_tensor_affine_cachemask_tensor_qparams( + const Tensor& self, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max) { + TORCH_CHECK( + quant_min <= quant_max, + "`quant_min` should be less than or \ + equal to `quant_max`."); + auto Y = at::empty_like(self, self.options(), MemoryFormat::Preserve); + auto mask = at::empty_like(self, at::kBool, MemoryFormat::Preserve); + fake_quant_tensor_cachemask_tensor_qparams_stub( + self.device().type(), Y, mask, self, scale, zero_point, quant_min, quant_max); + // TODO(future, optional): look into packing the mask further (BoolTensor uses + // 1 byte per element, we only need 1 bit per element). + return std::make_tuple(Y, mask); +} + /* Backward path to fake-quantize the 'inputs' tensor, with mask. 
Args: diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py index 2b275fde5b982..adde8508224b2 100644 --- a/test/quantization/core/test_workflow_ops.py +++ b/test/quantization/core/test_workflow_ops.py @@ -234,19 +234,21 @@ def _test_forward_per_tensor_cachemask_impl(self, device): float_types = (torch.float32, torch.float16, torch.float64) torch_types = (torch.qint8, torch.quint8) Xs = (torch.randn(4, 8, device=device), torch.randn(4, 16, device=device)[:, ::2]) - for float_type, torch_type, X in itertools.product(float_types, torch_types, Xs): + tensor_qparam = (True, False) + for float_type, torch_type, X, tensor_qparams in itertools.product(float_types, torch_types, Xs, tensor_qparam): # pick the scale + zp so that some values get clipped X = X.to(float_type) obs = torch.quantization.MinMaxObserver(torch_type) + obs.to(device) obs(X * 0.75) scale, zero_point = obs.calculate_qparams() - scale, zero_point = float(scale), int(zero_point) quant_min, quant_max = obs._calculate_qmin_qmax() - + if not tensor_qparam: + scale, zero_point = float(scale), int(zero_point) Y_test = torch.fake_quantize_per_tensor_affine( X, scale, zero_point, quant_min, quant_max) Y_ref = _fake_quantize_per_tensor_affine_reference( - X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + X, scale, zero_point, quant_min, quant_max).to(device) self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) self.assertTrue(Y_test.dtype == float_type) @@ -262,21 +264,24 @@ def test_forward_per_tensor_cachemask_cuda(self): def _test_backward_per_tensor_cachemask_impl(self, device): float_types = (torch.float32, torch.float16, torch.float64) torch_types = (torch.qint8, torch.quint8) - for float_type, torch_type in itertools.product(float_types, torch_types): + tensor_qparam = (True, False) + for float_type, torch_type, tensor_qparam in itertools.product(float_types, torch_types, tensor_qparam): X = torch.randn(4, 8).to(device).to(float_type) X.requires_grad_() # pick the scale + zp so that some values get clipped obs = torch.quantization.MinMaxObserver(torch_type) + obs.to(device) obs(X * 0.75) scale, zero_point = obs.calculate_qparams() - scale, zero_point = float(scale), int(zero_point) + if not tensor_qparam: + scale, zero_point = float(scale), int(zero_point) quant_min, quant_max = obs._calculate_qmin_qmax() # forward pass Y_test = torch.fake_quantize_per_tensor_affine( X, scale, zero_point, quant_min, quant_max) Y_ref = _fake_quantize_per_tensor_affine_reference( - X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + X, scale, zero_point, quant_min, quant_max).to(device) self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) # backward pass diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index c8cb454de327c..94e55ca287a08 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -16,7 +16,7 @@ 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "_unpack_dual", 'linalg_qr', '_svd_helper', 'linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask', 'fake_quantize_per_channel_affine_cachemask', 'linalg_lstsq', 'linalg_eig', 'linalg_cholesky_ex', - 'frexp', 'lu_unpack', 'histogram' + 'frexp', 'lu_unpack', 'histogram', '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams' } @@ -88,6 +88,8 @@ def test_namedtuple_return(self): input=(torch.tensor([3, 2, 1, 4, 5], dtype=torch.int32), True, True), names=('P', 'L', 'U'), 
hasout=True), op(operators=['histogram'], input=(1,), names=('hist', 'bin_edges'), hasout=True), + op(operators=['_fake_quantize_per_tensor_affine_cachemask_tensor_qparams'], + input=(torch.tensor([1.0]), torch.tensor([0], dtype=torch.int), 0, 255), names=('output', 'mask',), hasout=False), ] def get_func(f): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index c40418f5456e7..5abb1a1446da5 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -553,6 +553,9 @@ - name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) +- name: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) + - name: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor self, scale, zero_point: "grad.defined() ? _fake_quantize_learnable_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max, grad_factor) : std::tuple()" diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 50566c0bcbc82..cf9bbb1aa3342 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -309,7 +309,7 @@ def _calculate_qparams( else: scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min) scale = torch.max(scale, self.eps) - zero_point = quant_min - torch.round(min_val_neg / scale) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) zero_point = torch.clamp(zero_point, quant_min, quant_max) # For scalar values, cast them to Tensors of size 1 to keep the shape From 7a15576a65715e7bcba017e5c2fa755610a0db22 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Sat, 10 Jul 2021 19:40:41 -0700 Subject: [PATCH 075/122] [quant] update FakeQuant modules to use tensor qparams (#61318) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61318 Remove the `float()` and `int()` calls in the forward function so that we can directly use the tensor qparams in the fake_quantize operator. Calling `float()/int()` internally calls `item()` which can trigger a gpu-> cpu copy if the original tensors reside on GPU. 
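As a rough usage sketch (not part of the patch set), the tensor-qparam path these two diffs enable looks like the following; it mirrors the calls exercised in the test_workflow_ops.py changes below, and the device and dtype choices here are only illustrative:

```python
# Minimal sketch, assuming a build that includes the tensor_qparams overload
# from #61317/#61318. scale/zero_point stay as tensors, so the fake_quantize
# call itself no longer forces a GPU -> CPU copy via float()/int()/item().
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(4, 8, device=device)

obs = torch.quantization.MinMaxObserver(dtype=torch.quint8)
obs.to(device)
obs(x * 0.75)                                # collect min/max statistics
scale, zero_point = obs.calculate_qparams()  # returned as tensors

# Previously the qparams had to be unwrapped:
#   torch.fake_quantize_per_tensor_affine(x, float(scale), int(zero_point), 0, 255)
# Now the tensors are passed through directly (0/255 are the quint8 bounds):
y = torch.fake_quantize_per_tensor_affine(x, scale, zero_point, 0, 255)
```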
Local benchmark P427668213 Before this change ``` Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls --------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ aten::_aminmax 2.57% 1.507ms 3.10% 1.819ms 36.371us 2.872ms 4.81% 2.872ms 57.446us 50 aten::fake_quantize_per_tensor_affine 1.04% 610.915us 3.60% 2.114ms 42.276us 472.896us 0.79% 2.698ms 53.962us 50 aten::fake_quantize_per_tensor_affine_cachemask 1.69% 993.626us 2.56% 1.503ms 30.058us 2.225ms 3.73% 2.225ms 44.504us 50 aten::is_nonzero 3.85% 2.258ms 19.68% 11.540ms 46.161us 2.168ms 3.63% 11.084ms 44.336us 250 aten::zeros_like 1.82% 1.064ms 6.65% 3.901ms 39.007us 1.531ms 2.57% 3.905ms 39.045us 100 aten::eq 13.80% 8.093ms 25.90% 15.189ms 37.972us 9.580ms 16.05% 15.566ms 38.914us 400 aten::item 5.67% 3.323ms 21.50% 12.607ms 36.019us 3.233ms 5.42% 12.167ms 34.762us 350 aten::zeros 0.94% 549.208us 2.93% 1.717ms 34.343us 688.928us 1.15% 1.695ms 33.894us 50 aten::le 2.52% 1.478ms 4.50% 2.641ms 26.411us 1.753ms 2.94% 2.845ms 28.448us 100 aten::rsub 1.04% 608.715us 2.44% 1.433ms 28.667us 532.000us 0.89% 1.418ms 28.353us 50 aten::max 1.54% 905.401us 4.62% 2.711ms 27.106us 847.488us 1.42% 2.697ms 26.969us 100 aten::ones 0.92% 542.159us 2.16% 1.266ms 25.324us 661.856us 1.11% 1.301ms 26.017us 50 aten::min 0.82% 479.167us 2.15% 1.258ms 25.160us 407.808us 0.68% 1.276ms 25.530us 50 aten::_local_scalar_dense 15.83% 9.284ms 15.83% 9.284ms 26.526us 8.934ms 14.97% 8.934ms 25.524us 350 aten::clamp 2.35% 1.378ms 4.21% 2.467ms 24.669us 1.546ms 2.59% 2.461ms 24.612us 100 aten::zero_ 2.53% 1.482ms 5.65% 3.316ms 22.108us 1.326ms 2.22% 3.380ms 22.531us 150 aten::maximum 3.08% 1.805ms 3.08% 1.805ms 18.052us 1.849ms 3.10% 1.849ms 18.494us 100 aten::minimum 1.33% 778.854us 1.33% 778.854us 15.577us 868.672us 1.46% 868.672us 17.373us 50 aten::round 1.36% 799.910us 1.36% 799.910us 15.998us 809.568us 1.36% 809.568us 16.191us 50 aten::copy_ 6.61% 3.878ms 6.61% 3.878ms 15.513us 4.036ms 6.76% 4.036ms 16.143us 250 aten::div 2.53% 1.483ms 2.53% 1.483ms 14.833us 1.535ms 2.57% 1.535ms 15.353us 100 aten::mul 2.44% 1.431ms 2.44% 1.431ms 14.314us 1.478ms 2.48% 1.478ms 14.782us 100 aten::detach 1.46% 855.670us 2.41% 1.411ms 14.110us 832.448us 1.39% 1.395ms 13.949us 100 aten::add 2.22% 1.301ms 2.22% 1.301ms 13.008us 1.383ms 2.32% 1.383ms 13.828us 100 aten::fill_ 4.18% 2.452ms 4.18% 2.452ms 12.262us 2.693ms 4.51% 2.693ms 13.463us 200 aten::sub 5.06% 2.967ms 5.06% 2.967ms 14.837us 2.675ms 4.48% 2.675ms 13.374us 200 aten::to 2.10% 1.230ms 3.65% 2.140ms 10.701us 1.310ms 2.20% 2.062ms 10.310us 200 aten::select 1.28% 749.144us 1.49% 874.227us 8.742us 863.232us 1.45% 863.232us 8.632us 100 detach 0.95% 555.326us 0.95% 555.326us 5.553us 562.496us 0.94% 562.496us 5.625us 100 aten::as_strided 0.40% 232.289us 0.40% 232.289us 1.161us 0.000us 0.00% 0.000us 0.000us 200 aten::empty 2.93% 1.720ms 2.93% 1.720ms 3.439us 0.000us 0.00% 0.000us 0.000us 500 aten::resize_ 1.04% 611.313us 1.04% 611.313us 2.038us 0.000us 0.00% 0.000us 0.000us 300 aten::empty_like 0.75% 438.585us 1.77% 1.036ms 5.180us 0.000us 0.00% 0.000us 0.000us 200 aten::empty_strided 1.36% 799.442us 1.36% 799.442us 3.198us 0.000us 0.00% 0.000us 0.000us 250 --------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time 
total: 58.645ms Self CUDA time total: 59.674ms ``` After this change ``` test_fake_quant_profiler (scripts.supriyar.benchmark.module_bench.ProfilerBench) ... ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ aten::fake_quantize_per_tensor_affine 0.98% 505.210us 4.38% 2.259ms 45.187us 419.424us 0.78% 3.218ms 64.367us 50 aten::_aminmax 2.78% 1.434ms 3.42% 1.766ms 35.321us 2.825ms 5.27% 2.825ms 56.505us 50 aten::fake_quantize_per_tensor_affine_cachemask_tens... 2.38% 1.229ms 3.40% 1.754ms 35.083us 2.799ms 5.22% 2.799ms 55.979us 50 aten::rsub 0.94% 485.040us 5.02% 2.590ms 51.793us 458.976us 0.86% 2.587ms 51.747us 50 aten::is_nonzero 3.78% 1.952ms 23.64% 12.196ms 48.786us 2.055ms 3.83% 11.986ms 47.944us 250 aten::item 6.92% 3.572ms 19.86% 10.244ms 40.977us 3.670ms 6.85% 9.931ms 39.724us 250 aten::zeros_like 1.65% 848.874us 6.64% 3.426ms 34.260us 1.397ms 2.61% 3.572ms 35.717us 100 aten::zeros 0.85% 436.691us 3.00% 1.549ms 30.984us 551.936us 1.03% 1.576ms 31.516us 50 aten::eq 10.60% 5.467ms 20.26% 10.452ms 26.130us 7.018ms 13.09% 10.832ms 27.079us 400 aten::le 2.58% 1.332ms 4.67% 2.407ms 24.074us 1.580ms 2.95% 2.614ms 26.144us 100 aten::_local_scalar_dense 12.93% 6.673ms 12.93% 6.673ms 26.691us 6.261ms 11.68% 6.261ms 25.046us 250 aten::clamp 2.43% 1.253ms 4.37% 2.256ms 22.560us 1.431ms 2.67% 2.273ms 22.725us 100 aten::ones 0.89% 460.133us 2.18% 1.123ms 22.467us 570.496us 1.06% 1.128ms 22.551us 50 aten::min 0.74% 383.132us 2.06% 1.065ms 21.296us 377.536us 0.70% 1.091ms 21.824us 50 aten::zero_ 2.36% 1.219ms 5.87% 3.029ms 20.194us 1.261ms 2.35% 3.199ms 21.327us 150 aten::max 1.51% 779.081us 4.06% 2.096ms 20.960us 791.680us 1.48% 2.130ms 21.295us 100 aten::sub 7.97% 4.111ms 7.97% 4.111ms 20.556us 3.847ms 7.18% 3.847ms 19.234us 200 aten::div 2.94% 1.516ms 2.94% 1.516ms 15.158us 1.580ms 2.95% 1.580ms 15.798us 100 aten::round 1.45% 750.445us 1.45% 750.445us 15.009us 756.064us 1.41% 756.064us 15.121us 50 aten::copy_ 6.88% 3.548ms 6.88% 3.548ms 14.190us 3.701ms 6.90% 3.701ms 14.803us 250 aten::minimum 1.32% 681.654us 1.32% 681.654us 13.633us 713.664us 1.33% 713.664us 14.273us 50 aten::maximum 2.55% 1.317ms 2.55% 1.317ms 13.169us 1.338ms 2.50% 1.338ms 13.378us 100 aten::mul 2.63% 1.358ms 2.63% 1.358ms 13.581us 1.328ms 2.48% 1.328ms 13.283us 100 aten::detach 1.34% 688.820us 2.35% 1.211ms 12.110us 772.800us 1.44% 1.278ms 12.779us 100 aten::fill_ 4.53% 2.338ms 4.53% 2.338ms 11.692us 2.495ms 4.65% 2.495ms 12.473us 200 aten::add 2.32% 1.197ms 2.32% 1.197ms 11.968us 1.240ms 2.31% 1.240ms 12.405us 100 aten::to 2.07% 1.069ms 3.66% 1.889ms 9.443us 1.224ms 2.28% 1.975ms 9.874us 200 aten::select 1.44% 743.042us 1.64% 848.207us 8.482us 641.600us 1.20% 641.600us 6.416us 100 detach 1.01% 522.155us 1.01% 522.155us 5.222us 505.088us 0.94% 505.088us 5.051us 100 aten::as_strided 0.44% 227.884us 0.44% 227.884us 1.139us 0.000us 0.00% 0.000us 0.000us 200 aten::empty 3.20% 1.652ms 3.20% 1.652ms 3.304us 0.000us 0.00% 0.000us 0.000us 500 aten::resize_ 1.25% 646.711us 1.25% 646.711us 2.156us 0.000us 0.00% 0.000us 0.000us 300 aten::empty_like 0.79% 407.768us 2.07% 1.067ms 5.334us 0.000us 0.00% 
0.000us 0.000us 200 aten::empty_strided 1.52% 785.788us 1.52% 785.788us 3.143us 0.000us 0.00% 0.000us 0.000us 250 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 51.590ms Self CUDA time total: 53.609ms ghstack-source-id: 133370215 Test Plan: buck test mode/dev-nosan caffe2/test/:quantization Reviewed By: raghuramank100 Differential Revision: D29566512 fbshipit-source-id: 1aefca51f99949da7334bcfe504848275c9f952c --- .../cpu/kernels/QuantizedOpKernels.cpp | 4 ++-- .../quantized/fake_quant_per_channel_affine.cpp | 10 +++++----- .../operator_benchmark/pt/quantization_test.py | 2 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 2 +- test/quantization/core/test_workflow_ops.py | 17 +++++++++-------- torch/quantization/fake_quantize.py | 14 +++++++------- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index cf807afb42779..05785823690b0 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2176,14 +2176,14 @@ void fake_quant_per_channel_cachemask_cpu( // for simplicity, as we do not expect this to be a bottleneck. AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "fake_quantize_channel_cachemask_cpu_type_handling", [&] { // write mask - cpu_kernel(iter_mask, [=](scalar_t self, float scale, int64_t zero_point) -> bool { + cpu_kernel(iter_mask, [=](scalar_t self, float scale, int32_t zero_point) -> bool { float inv_scale = 1.0f / scale; const auto qval = static_cast(zero_point + std::nearbyint(self * inv_scale)); return ((quant_min <= qval) && (qval <= quant_max)); }); // write fake_quant - cpu_kernel(iter, [=](scalar_t self, float scale, int64_t zero_point) -> scalar_t { + cpu_kernel(iter, [=](scalar_t self, float scale, int32_t zero_point) -> scalar_t { float inv_scale = 1.0f / scale; // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) return (std::fmin( diff --git a/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp index d5e49dd24c342..f136aa1633b83 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp @@ -50,8 +50,8 @@ std::tuple fake_quantize_per_channel_affine_cachemask( int64_t axis, int64_t quant_min, int64_t quant_max) { - TORCH_CHECK(zero_point.scalar_type() == ScalarType::Long, - "Zero-point must be Long, found ", zero_point.scalar_type()); + TORCH_CHECK(zero_point.scalar_type() == ScalarType::Int, + "Zero-point must be Int32, found ", zero_point.scalar_type()); TORCH_CHECK(scale.dim() == 1, "scale should be a 1-D tensor"); TORCH_CHECK(zero_point.dim() == 1, "zero point should be a 1-D tensor"); TORCH_CHECK( @@ -67,8 +67,8 @@ std::tuple fake_quantize_per_channel_affine_cachemask( equal to `quant_max`."); TORCH_CHECK( - at::min(zero_point).item().toLong() >= quant_min && - at::max(zero_point).item().toLong() <= quant_max, + at::min(zero_point).item().toInt() >= quant_min && + at::max(zero_point).item().toInt() <= quant_max, "`zero_point` must be between `quant_min` and `quant_max`."); TORCH_CHECK( @@ -145,7 +145,7 @@ Tensor _fake_quantize_learnable_per_channel_affine( int64_t 
quant_min, int64_t quant_max, double grad_factor) { - Tensor zero_point_rounded = _get_rounded_zero_point(zero_point, quant_min, quant_max).to(at::kLong); + Tensor zero_point_rounded = _get_rounded_zero_point(zero_point, quant_min, quant_max).to(at::kInt); return native::fake_quantize_per_channel_affine( self, scale, zero_point_rounded, axis, quant_min, quant_max); } diff --git a/benchmarks/operator_benchmark/pt/quantization_test.py b/benchmarks/operator_benchmark/pt/quantization_test.py index 0b9c9508298d6..03c753ceebbcf 100644 --- a/benchmarks/operator_benchmark/pt/quantization_test.py +++ b/benchmarks/operator_benchmark/pt/quantization_test.py @@ -276,7 +276,7 @@ def init(self, N, C, H, W, nbits, device, op_func): if op_func.__name__ == 'fakeQuantizePerChannelOriginalKernel': self.scale = torch.ones(C, device=device, dtype=torch.float32, requires_grad=False) - self.zero_point = torch.zeros(C, device=device, dtype=torch.int64, requires_grad=False) + self.zero_point = torch.zeros(C, device=device, dtype=torch.int32, requires_grad=False) else: self.scale = nn.Parameter(torch.ones(C, device=device, dtype=torch.float32), requires_grad=self.auto_set()) self.zero_point = nn.Parameter(torch.zeros(C, device=device, dtype=torch.float32), requires_grad=self.auto_set()) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 422ecfb6099fa..cbbc00bfae127 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -7563,7 +7563,7 @@ class FakeQuantizePerChannelModel(torch.nn.Module): def forward(self, input): amax = torch.ones(4) scale = amax / 127. - zero_point = torch.zeros_like(amax, dtype=torch.long) + zero_point = torch.zeros_like(amax, dtype=torch.int) # Quantize twice to test differnet branches y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255) return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127) diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py index adde8508224b2..f7c73222a6631 100644 --- a/test/quantization/core/test_workflow_ops.py +++ b/test/quantization/core/test_workflow_ops.py @@ -96,7 +96,7 @@ def _fake_quantize_learnable_per_channel_affine_grad_reference( - https://arxiv.org/pdf/1902.08153.pdf - https://arxiv.org/pdf/1903.08066.pdf """ - per_channel_zero_point = ((per_channel_zero_point.detach() + 0.5).clamp(quant_min, quant_max)).type(torch.int64) + per_channel_zero_point = ((per_channel_zero_point.detach() + 0.5).clamp(quant_min, quant_max)).type(torch.int32) grad_X = _fake_quantize_per_channel_affine_grad_reference( dY, X, per_channel_scale, per_channel_zero_point, axis, quant_min, quant_max).to(device) per_channel_scale = per_channel_scale.detach().type(torch.float) @@ -309,7 +309,7 @@ def _test_learnable_forward_per_tensor(self, X, device, scale_base, zero_point_b X = X_base.clone().float() scale_base = scale_base.to(device).float() - zero_point_base = zero_point_base.to(dtype=torch.int64, device=device) + zero_point_base = zero_point_base.to(dtype=torch.int32, device=device) scale = scale_base.clone() zero_point = zero_point_base.clamp(quant_min, quant_max) @@ -445,6 +445,7 @@ def test_fixed_qparams_fq_module(self, device, X): X, (scale, zero_point, torch_type) = X X = to_tensor(X, device) fq_module = default_affine_fixed_qparams_fake_quant() + fq_module.to(device) fixed_scale = fq_module.scale.clone() fixed_zero_point = fq_module.zero_point.clone() # run fq module and 
make sure the quantization parameters does not change @@ -585,7 +586,7 @@ def test_forward_per_channel(self, device, X): X = to_tensor(X, device) scale = to_tensor(scale, device) - zero_point = torch.tensor(zero_point).to(dtype=torch.int64, device=device) + zero_point = torch.tensor(zero_point).to(dtype=torch.int32, device=device) Y = _fake_quantize_per_channel_affine_reference(X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max) Y_prime = torch.fake_quantize_per_channel_affine( X, scale, zero_point, axis, quant_min, quant_max) @@ -602,7 +603,7 @@ def _test_forward_per_channel_cachemask_impl(self, device): obs(X * 0.75) scale, zero_point = obs.calculate_qparams() # TODO(future PR): fix the wrong dtype in obs.calculate_qparams and remove the cast - zero_point = zero_point.to(torch.int64) + zero_point = zero_point.to(torch.int32) quant_min, quant_max = obs._calculate_qmin_qmax() Y = _fake_quantize_per_channel_affine_reference( @@ -621,7 +622,7 @@ def test_forward_per_channel_cachemask_cuda(self): def test_forward_per_channel_half_precision_numerics(self): scale = torch.randn(5).abs() - zero = torch.randn(5).to(dtype=torch.long) + zero = torch.randn(5).to(dtype=torch.int) axis = 1 mini = 0 maxi = 255 @@ -708,7 +709,7 @@ def test_backward_per_channel(self, device, X): X = to_tensor(X, device) scale = to_tensor(scale, device) - zero_point = torch.tensor(zero_point).to(dtype=torch.int64, device=device) + zero_point = torch.tensor(zero_point).to(dtype=torch.int32, device=device) X.requires_grad_() Y_prime = torch.fake_quantize_per_channel_affine( X, scale, zero_point, axis, quant_min, quant_max) @@ -729,7 +730,7 @@ def _test_backward_per_channel_cachemask_impl(self, device): obs(X * 0.75) scale, zero_point = obs.calculate_qparams() # TODO(future PR): fix the wrong dtype in obs.calculate_qparams and remove the cast - zero_point = zero_point.to(torch.int64) + zero_point = zero_point.to(torch.int32) quant_min, quant_max = obs._calculate_qmin_qmax() X.requires_grad_() Y_prime = torch.fake_quantize_per_channel_affine( @@ -837,7 +838,7 @@ def _test_numerical_consistency(self, test_type): torch.random.manual_seed(NP_RANDOM_SEED) torch_types = [torch.qint8, torch.quint8] float_types = [torch.float, torch.float16, torch.float64] - zero_types = [torch.long] + zero_types = [torch.int] devices = [torch.device('cpu'), torch.device('cuda')] if torch.cuda.is_available() else [torch.device('cpu')] axis = 1 for i in range(20): diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 917748ff9ea26..6001023994c9f 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -109,8 +109,8 @@ def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=0, quant_max= self.activation_post_process = observer(**observer_kwargs) assert torch.iinfo(self.activation_post_process.dtype).min <= quant_min, 'quant_min out of bound' assert quant_max <= torch.iinfo(self.activation_post_process.dtype).max, 'quant_max out of bound' - self.register_buffer('scale', torch.tensor([1.0])) - self.register_buffer('zero_point', torch.tensor([0])) + self.register_buffer('scale', torch.tensor([1.0], dtype=torch.float)) + self.register_buffer('zero_point', torch.tensor([0], dtype=torch.int)) self.dtype = self.activation_post_process.dtype self.qscheme = self.activation_post_process.qscheme self.ch_axis = self.activation_post_process.ch_axis \ @@ -143,7 +143,7 @@ def forward(self, X): self.ch_axis, self.quant_min, self.quant_max) else: X = 
torch.fake_quantize_per_tensor_affine( - X, float(self.scale), int(self.zero_point), + X, self.scale, self.zero_point, self.quant_min, self.quant_max) return X @@ -218,8 +218,8 @@ def __init__(self, assert quant_min <= quant_max, 'quant_min should be less than or equal to quant_max' self.quant_min = quant_min self.quant_max = quant_max - self.register_buffer('scale', torch.tensor([scale])) - self.register_buffer('zero_point', torch.tensor([zero_point])) + self.register_buffer('scale', torch.tensor([scale], dtype=torch.float)) + self.register_buffer('zero_point', torch.tensor([zero_point], dtype=torch.int)) self.dtype = dtype self.qscheme = qscheme assert _is_per_tensor(self.qscheme), 'Only per tensor quantization is supported' + \ @@ -227,8 +227,8 @@ def __init__(self, def forward(self, X): if self.fake_quant_enabled[0] == 1: - X = torch.fake_quantize_per_tensor_affine(X, float(self.scale), - int(self.zero_point), self.quant_min, + X = torch.fake_quantize_per_tensor_affine(X, self.scale, + self.zero_point, self.quant_min, self.quant_max) return X From 292ee652615047a5cba0d1b864ddca978d5b536a Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Sat, 10 Jul 2021 20:04:51 -0700 Subject: [PATCH 076/122] add maxpool2D, add more tests, handle integer parameters for maxpool2D (#61188) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61188 Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D29626303 Pulled By: migeed-z fbshipit-source-id: 32309cd1eb1189beaba63017653b3aeccdf2761d --- test/fx/test_gradual_type.py | 140 ++++++++++++++++++ .../experimental/graph_gradual_typechecker.py | 64 +++++++- 2 files changed, 197 insertions(+), 7 deletions(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index 9946303548b2e..a2066fc488d9b 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -551,5 +551,145 @@ def forward(self, x: TensorType((2, 2, 4, 5))): assert isinstance(n.type, TensorType) assert torch.Size(n.type.__args__) == B.forward(torch.rand(2, 2, 4, 5)).size() + def test_type_check_conv2D_and_maxpool2d(self): + + class BasicBlock(torch.nn.Module): + def __init__(self): + super(BasicBlock, self).__init__() + + self.conv1 = torch.nn.Conv2d(3, 6, 5) + self.pool = torch.nn.MaxPool2d(2, 2) + + def forward(self, x : TensorType((4, 3, 32, 32))): + out = self.conv1(x) + out = self.pool(out) + return out + + B = BasicBlock() + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + tc = GraphTypeChecker({}, traced) + tc.type_check() + + expected_ph_types = [TensorType((4, 3, 32, 32)), TensorType((4, 6, 28, 28)), + TensorType((4, 6, 14, 14)), TensorType((4, 6, 14, 14))] + expected_iter = iter(expected_ph_types) + + for n in traced.graph.nodes: + assert n.type == next(expected_iter) + + + + def test_type_typechecl_maxpool2d_3dinput(self): + + class BasicBlock(torch.nn.Module): + def __init__(self): + super(BasicBlock, self).__init__() + self.pool = torch.nn.MaxPool2d(5, 8) + + def forward(self, x : TensorType((64, 8, 8))): + out = self.pool(x) + return out + + B = BasicBlock() + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in traced.graph.nodes: + if n.target == 'output': + assert n.type == TensorType((64, 1, 1)) + + def test_type_maxpool2d_fully_static(self): + annotation_list = [(Dyn, Dyn, 3, 5), (2, 5, 6, 9), 
(10, 15, 13, 14), + (10, Dyn, 13, 14), (Dyn, Dyn, Dyn, 10)] + input_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14), + (10, 15, 13, 14), (2, 2, 10, 10)] + intermediate_types = [(1, 2, Dyn, Dyn), (2, Dyn, 2, 4), (10, 15, Dyn, 2), + (10, 15, 2, 3), (2, Dyn, Dyn, Dyn)] + stride_list = [1, 2, 3, 2, 1] + dilation_list = [1, 2, 3, 3, 2] + padding_list = [1, 2, 3, 3, 1] + kernel_size_list = [2, 4, 6, 6, 3] + output_types = [(1, 2, 4, 6), (2, 5, 2, 4), (10, 15, 2, 2), (10, 15, 2, 3), (2, Dyn, Dyn, 8)] + + for i in range(5): + annotation = annotation_list[i] + input = input_list[i] + stride = stride_list[i] + dilation = dilation_list[i] + padding = padding_list[i] + kernel_size = kernel_size_list[i] + intermediate_type = intermediate_types[i] + + class BasicBlock(torch.nn.Module): + def __init__(self, kernel_size, stride, padding, dilation): + super(BasicBlock, self).__init__() + self.pool = torch.nn.MaxPool2d(kernel_size, stride=stride, + padding=padding, dilation=dilation, + return_indices=False, ceil_mode=False) + + def forward(self, x): + out = self.pool(x) + return out + + B = BasicBlock(kernel_size, stride, padding, dilation) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + # annotate our argument + for n in graph.nodes: + if n.op == 'placeholder': + n.type = TensorType(annotation) + + b = B.forward(torch.rand(input)) + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in graph.nodes: + if n.op == 'output': + assert is_consistent(n.type, TensorType(b.size())) + + # test with intermediate annotations + class BasicBlock(torch.nn.Module): + def __init__(self, kernel_size, stride, padding, dilation): + super(BasicBlock, self).__init__() + self.pool = torch.nn.MaxPool2d(kernel_size, stride=stride, + padding=padding, dilation=dilation, + return_indices=False, ceil_mode=False) + + def forward(self, x): + out = self.pool(x) + return out + + B = BasicBlock(kernel_size, stride, padding, dilation) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + # annotate our argument + for n in graph.nodes: + if n.op == 'placeholder': + n.type = TensorType(annotation) + + # populate our intermediate notes + for n in traced.graph.nodes: + if n.op == 'call_module': + n.type = TensorType(intermediate_type) + + tc = GraphTypeChecker({}, traced) + tc.type_check() + + for n in traced.graph.nodes: + if n.op == 'output': + assert n.type == TensorType(output_types[i]) + assert is_consistent(n.type, TensorType(b.size())) + + + if __name__ == '__main__': unittest.main() diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index d505230198501..e7295745e4fa2 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -207,19 +207,31 @@ def bn2d_inference_rule(n: Node, module_instance): else: raise TypeError(f'Cannot apply {module_instance} with input type {arg_type} and existing type {n.type} on {n}') + def calculate(d_in, module_instance, index): """ For calculating h_in and w_out. 
""" + + padding = (module_instance.padding, module_instance.padding) \ + if isinstance(module_instance.padding, int) else module_instance.padding + kernel_size = (module_instance.kernel_size, module_instance.kernel_size)\ + if isinstance(module_instance.kernel_size, int) else module_instance.kernel_size + stride = (module_instance.stride, module_instance.stride) \ + if isinstance(module_instance.stride, int) else module_instance.stride + dilation = (module_instance.dilation, module_instance.dilation)\ + if isinstance(module_instance.dilation, int) else module_instance.dilation + if d_in == Dyn: return Dyn elif isinstance(d_in, int): - n = d_in + 2 * module_instance.padding[index] - \ - module_instance.dilation[index] * \ - (module_instance.kernel_size[index] - 1) - 1 + n = d_in + 2 * padding[index] - \ + dilation[index] * \ + (kernel_size[index] - 1) - 1 + + return (n // stride[0]) + 1 - return (n // module_instance.stride[0]) + 1 else: raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn') @@ -233,11 +245,11 @@ def get_greatest_upper_bound(type1, type2): elif type2 == Dyn: return type1 elif isinstance(type1, TensorType) and isinstance(type2, TensorType): - assert is_consistent(type1, type2) + if not is_consistent(type1, type2): + raise TypeError(f'Inconsistent types {type1}, {type2}') gub = [t1 if is_more_precise(t1, t2) else t2 for (t1, t2) in zip(type1.__args__, type2.__args__)] return TensorType(tuple(gub)) - else: - raise NotImplementedError(f'Greatest upper bound not yet implemented for these types {type1}, {type2}') + @register_inference_rule(Conv2d) def conv2d_inference_rule(n: Node, module_instance): @@ -262,6 +274,7 @@ def conv2d_inference_rule(n: Node, module_instance): new_type = TensorType((arg_type.__args__[0], module_instance.out_channels, h_out, w_out)) gub = get_greatest_upper_bound(new_type, curr_node_type) n.type = gub + return n.type else: raise TypeError(f'Cannot apply {module_instance} with input type { arg_type} and existing type {n.type} on {n}') @@ -281,6 +294,43 @@ def relu_inference_rule(n: Node, module_instance): n.type = get_greatest_upper_bound(n.args[0].type, n.type) return n.type + +def maxpool2d_check(typ, module_instance): + new_type_list = list(typ.__args__) + if len(new_type_list) == 4 or len(new_type_list) == 3: + w_in = new_type_list[-1] + h_in = new_type_list[-2] + h_out = calculate(h_in, module_instance, 0) + w_out = calculate(w_in, module_instance, 1) + new_type_list[-1] = w_out + new_type_list[-2] = h_out + return TensorType(tuple(new_type_list)) + + else: + raise TypeError(f'Wrong size {typ} for {module_instance}') + + +@register_inference_rule(torch.nn.MaxPool2d) +def maxpool2d_inference_rule(n: Node, module_instance): + """ + Given a MaxPool2D instance and a node check the following conditions: + - Input size matches size 3 or 4 + - Current node type is consistent with the output type we will calculate + - Input size matches output size and the last two dimensions of the output + are w_out and h_out. The remaining dimensions are the same as the input + - Our final result is the greatest upper bound of the output we calculate + and the current node type. 
+ """ + assert isinstance(n.args[0], Node) + + if n.args[0].type == Dyn and isinstance(n.type, TensorType): + n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__)) + if isinstance(n.args[0].type, TensorType): + output = maxpool2d_check(n.args[0].type, module_instance) + n.type = get_greatest_upper_bound(output, n.type) + return n.type + + class GraphTypeChecker: def __init__(self, env, traced): self.env = env From 93ef40bd833b7539c0d9ed89f769be17eaef22bc Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Sat, 10 Jul 2021 20:04:51 -0700 Subject: [PATCH 077/122] add linear operation and modify one of the tests (#61238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61238 Test Plan: Imported from OSS Reviewed By: iramazanli Differential Revision: D29626333 Pulled By: migeed-z fbshipit-source-id: d4303918e380d64ba8ab678f249db6674e89357a --- test/fx/test_gradual_type.py | 10 ++++++- .../experimental/graph_gradual_typechecker.py | 29 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index a2066fc488d9b..fd3498dca8736 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -559,10 +559,15 @@ def __init__(self): self.conv1 = torch.nn.Conv2d(3, 6, 5) self.pool = torch.nn.MaxPool2d(2, 2) + self.conv2 = torch.nn.Conv2d(6, 16, 5) + self.fc1 = torch.nn.Linear(5, 120) def forward(self, x : TensorType((4, 3, 32, 32))): out = self.conv1(x) out = self.pool(out) + out = self.conv2(out) + out = self.pool(out) + out = self.fc1(out) return out B = BasicBlock() @@ -573,7 +578,10 @@ def forward(self, x : TensorType((4, 3, 32, 32))): tc.type_check() expected_ph_types = [TensorType((4, 3, 32, 32)), TensorType((4, 6, 28, 28)), - TensorType((4, 6, 14, 14)), TensorType((4, 6, 14, 14))] + TensorType((4, 6, 14, 14)), TensorType((4, 16, 10, 10)), + TensorType((4, 16, 5, 5)), TensorType((4, 16, 5, 120)), + TensorType((4, 16, 5, 120))] + expected_iter = iter(expected_ph_types) for n in traced.graph.nodes: diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index e7295745e4fa2..5ace7d7408132 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -331,6 +331,35 @@ def maxpool2d_inference_rule(n: Node, module_instance): return n.type + +def linear_check(tensor_type, module_instance): + """ + Checks that an input tensor type satisfies the conditions for linear operation + and returns the output type based on in and out features given by module_instance + """ + if len(tensor_type.__args__) >= 2: + if is_consistent(module_instance.in_features, tensor_type.__args__[-1]): + # Todo backwards propagation + new_type_args = list(tensor_type.__args__) + new_type_args[-1] = module_instance.out_features + return TensorType(tuple(new_type_args)) + else: + raise TypeError(f'Inconsistent {module_instance.in_features} and {tensor_type.__args__[-1]} in {module_instance}') + else: + raise TypeError(f'Type {tensor_type} must have rank 2 or more.') + + +@register_inference_rule(torch.nn.Linear) +def linear_inference_rule(n: Node, module_instance): + assert isinstance(n.args[0], Node) + if n.args[0].type == Dyn and isinstance(n.type, TensorType): + n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__)) + if isinstance(n.args[0].type, TensorType): + output_type = linear_check(n.args[0].type, module_instance) + n.type = 
get_greatest_upper_bound(output_type, n.type) + return n.type + + class GraphTypeChecker: def __init__(self, env, traced): self.env = env From 666dff381d887eccfe6d1b0ee22cff65e79230d2 Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Sat, 10 Jul 2021 20:04:51 -0700 Subject: [PATCH 078/122] add AdaptiveAvgPooling2D (#61239) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61239 Test Plan: Imported from OSS Reviewed By: iramazanli Differential Revision: D29626359 Pulled By: migeed-z fbshipit-source-id: b7cd4ce4176e2d6e7a853974443affd23a49d3d9 --- test/fx/test_gradual_type.py | 4 +- .../experimental/graph_gradual_typechecker.py | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index fd3498dca8736..23e54e0819d9f 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -561,6 +561,7 @@ def __init__(self): self.pool = torch.nn.MaxPool2d(2, 2) self.conv2 = torch.nn.Conv2d(6, 16, 5) self.fc1 = torch.nn.Linear(5, 120) + self.pool2 = torch.nn.AdaptiveAvgPool2d((6, 7)) def forward(self, x : TensorType((4, 3, 32, 32))): out = self.conv1(x) @@ -568,6 +569,7 @@ def forward(self, x : TensorType((4, 3, 32, 32))): out = self.conv2(out) out = self.pool(out) out = self.fc1(out) + out = self.pool2(out) return out B = BasicBlock() @@ -580,7 +582,7 @@ def forward(self, x : TensorType((4, 3, 32, 32))): expected_ph_types = [TensorType((4, 3, 32, 32)), TensorType((4, 6, 28, 28)), TensorType((4, 6, 14, 14)), TensorType((4, 16, 10, 10)), TensorType((4, 16, 5, 5)), TensorType((4, 16, 5, 120)), - TensorType((4, 16, 5, 120))] + TensorType((4, 16, 6, 7)), TensorType((4, 16, 6, 7))] expected_iter = iter(expected_ph_types) diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index 5ace7d7408132..1c72d1a1a918d 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -360,6 +360,44 @@ def linear_inference_rule(n: Node, module_instance): return n.type + +def adaptiveavgpool2d_check(tensor_type, module_instance): + output_size = module_instance.output_size + if isinstance(output_size, int): + output_size = [output_size, output_size] + elif isinstance(output_size, tuple): + output_size = list(output_size) + if output_size[0] is None: + output_size[0] = output_size[1] + if output_size[1] is None: + output_size[1] = output_size[0] + + new_type_list = list(tensor_type.__args__) + + if len(tensor_type.__args__) == 4 or len(tensor_type.__args__) == 3: + new_type_list[-1] = output_size[1] + new_type_list[-2] = output_size[0] + + return TensorType(tuple(new_type_list)) + + else: + raise TypeError(f'Tensor ranks must be 3 or 4. 
Got {tensor_type}') + +@register_inference_rule(torch.nn.AdaptiveAvgPool2d) +def adaptiveavgpool2d_inference_rule(n: Node, module_instance): + """ + The input and output sizes should be the same except for the last + two dimensions taken from the input, which represent width and height + """ + assert isinstance(n.args[0], Node) + if n.args[0].type == Dyn and isinstance(n.type, TensorType): + n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__)) + if isinstance(n.args[0].type, TensorType): + output_type = adaptiveavgpool2d_check(n.args[0].type, module_instance) + n.type = get_greatest_upper_bound(n.type, output_type) + return n.type + + class GraphTypeChecker: def __init__(self, env, traced): self.env = env From 5c1505076bfa764088e2ccef19d7f18336084530 Mon Sep 17 00:00:00 2001 From: CodemodService Bot <> Date: Mon, 12 Jul 2021 04:13:39 -0700 Subject: [PATCH 079/122] [Codemod][FBSourceBlackLinter] Daily `arc lint --take BLACK` Reviewed By: zertosh Differential Revision: D29656934 fbshipit-source-id: c40bbc8e4512b145050ee47db2c8dc781f3c36e9 --- test/package/test_dependency_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index d9ff435de57d5..83909fa0733c5 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -337,5 +337,6 @@ def test_repackage_mocked_module(self): with self.assertRaises(NotImplementedError): foo2.package_a.get_something() + if __name__ == "__main__": run_tests() From 60086ab39b96aa763412f40a7b2aaff06f701aaa Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Mon, 12 Jul 2021 06:58:51 -0700 Subject: [PATCH 080/122] Remove export PYTHONPATH hacks (#61487) Summary: Remove `export PYTHONPATH=$PWD` in favor of `-m` Pull Request resolved: https://github.com/pytorch/pytorch/pull/61487 Test Plan: Let's see if CI passes Reviewed By: 1ntEgr8 Differential Revision: D29645544 Pulled By: janeyx99 fbshipit-source-id: 841aea8ebed2cb1c7dbc68754b5fbdee932559c2 --- .circleci/config.yml | 9 +++------ .../verbatim-sources/job-specs/job-specs-custom.yml | 3 +-- .../verbatim-sources/job-specs/pytorch-job-specs.yml | 6 ++---- .github/templates/bazel_ci_workflow.yml.j2 | 6 ++---- .github/templates/linux_ci_workflow.yml.j2 | 6 ++---- .github/templates/windows_ci_workflow.yml.j2 | 3 +-- .github/workflows/build_linux_conda.yml | 3 +-- .github/workflows/build_linux_libtorch.yml | 3 +-- .github/workflows/build_linux_wheels.yml | 3 +-- .../periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 3 +-- .../pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml | 6 ++---- .../pytorch-linux-bionic-py3.8-gcc9-coverage.yml | 6 ++---- .../pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml | 6 ++---- .../pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml | 6 ++---- .github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml | 6 ++---- .../pytorch-linux-xenial-py3.6-gcc7-bazel-test.yml | 6 ++---- .github/workflows/pytorch-win-vs2019-cpu-py3.yml | 3 +-- .../workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml | 3 +-- .../workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 3 +-- 19 files changed, 30 insertions(+), 60 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e0f5eed8a03e6..b5d7c303d383d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -689,8 +689,7 @@ jobs: export JOB_BASE_NAME="$CIRCLE_JOB" export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID" cd workspace - export PYTHONPATH="\${PWD}" - python tools/stats/print_test_stats.py --upload-to-s3 
--compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test EOL echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh unbuffer bash command.sh | ts @@ -842,9 +841,8 @@ jobs: set -ex export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_WIN_BUILD_V1} export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_WIN_BUILD_V1} - export PYTHONPATH="$PWD" pip install typing_extensions boto3 - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test when: always - store_test_results: path: test/test-reports @@ -1454,12 +1452,11 @@ jobs: set -ex source /Users/distiller/workspace/miniconda3/bin/activate pip install boto3 - export PYTHONPATH="$PWD" # Using the same IAM user to write stats to our OSS bucket export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4} export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4} - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test when: always - store_test_results: path: test/test-reports diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 80d14fcda2d9a..47420a2922c2c 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -208,12 +208,11 @@ set -ex source /Users/distiller/workspace/miniconda3/bin/activate pip install boto3 - export PYTHONPATH="$PWD" # Using the same IAM user to write stats to our OSS bucket export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4} export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4} - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test when: always - store_test_results: path: test/test-reports diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index b25e0f3fd0d48..7a92af8b5200a 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -223,8 +223,7 @@ jobs: export JOB_BASE_NAME="$CIRCLE_JOB" export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID" cd workspace - export PYTHONPATH="\${PWD}" - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test EOL echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh unbuffer bash command.sh | ts @@ -376,9 +375,8 @@ jobs: set -ex export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_WIN_BUILD_V1} export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_WIN_BUILD_V1} - export PYTHONPATH="$PWD" pip install typing_extensions boto3 - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test when: always - store_test_results: path: test/test-reports diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index aa6e7eade777b..c147dea2afd4b 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ 
b/.github/templates/bazel_ci_workflow.yml.j2 @@ -81,11 +81,10 @@ name: Bazel Linux CI (!{{ build_environment }}) CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 .circleci/scripts/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Test PyTorch run: | export SHARD_NUMBER=0 @@ -194,6 +193,5 @@ name: Bazel Linux CI (!{{ build_environment }}) CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test {%- endblock %} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 1fcac6b7e5cb1..92dcbf4e8cbec 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -162,11 +162,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -421,8 +420,7 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test {%- endblock %} {%- if enable_doc_jobs %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index bfd28439f9b86..47234ccb986c7 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -263,5 +263,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/build_linux_conda.yml b/.github/workflows/build_linux_conda.yml index db8fe1079de04..9ceff866b94c8 100644 --- a/.github/workflows/build_linux_conda.yml +++ b/.github/workflows/build_linux_conda.yml @@ -102,11 +102,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: group: build-linux-conda-${{ github.event.pull_request.number || github.sha }} diff --git a/.github/workflows/build_linux_libtorch.yml b/.github/workflows/build_linux_libtorch.yml index c2cef1a245cb6..b363ff3a8a280 100644 --- a/.github/workflows/build_linux_libtorch.yml +++ 
b/.github/workflows/build_linux_libtorch.yml @@ -101,11 +101,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: group: build-linux-libtorch-${{ github.event.pull_request.number || github.sha }} diff --git a/.github/workflows/build_linux_wheels.yml b/.github/workflows/build_linux_wheels.yml index 1fc39c0ae48fa..b536dd4562d54 100644 --- a/.github/workflows/build_linux_wheels.yml +++ b/.github/workflows/build_linux_wheels.yml @@ -100,11 +100,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: group: build-linux-wheels-${{ github.event.pull_request.number || github.sha }} diff --git a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 223a8894268d6..3af28a3bce87e 100644 --- a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -237,5 +237,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml b/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml index ae5cd95da46de..330fcbb663a9a 100644 --- a/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml +++ b/.github/workflows/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7.yml @@ -152,11 +152,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -407,5 +406,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml index 46f51ad09f325..1a37ee46c1c4f 100644 --- a/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/pytorch-linux-bionic-py3.8-gcc9-coverage.yml @@ -153,11 +153,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ 
github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -408,5 +407,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml b/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml index 0ca67850462ba..7b921ecc6dd62 100644 --- a/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml +++ b/.github/workflows/pytorch-linux-xenial-cuda10.2-cudnn7-py3.6-gcc7.yml @@ -152,11 +152,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -407,5 +406,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml b/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml index f4f6e60f4e04d..ee47ee29ff6e3 100644 --- a/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml +++ b/.github/workflows/pytorch-linux-xenial-cuda11.1-cudnn8-py3.6-gcc7.yml @@ -152,11 +152,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -407,5 +406,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml index 49720b1e4ec11..438c506bbeb0b 100644 --- a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml @@ -153,11 +153,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export 
PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 tools/stats/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user @@ -408,8 +407,7 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test pytorch_python_doc_build: runs-on: linux.2xlarge diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc7-bazel-test.yml index 0c9aef0c1e6b0..842ee9cd7daf5 100644 --- a/.github/workflows/pytorch-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -170,11 +170,10 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) export COMMIT_TIME pip3 install requests - python3 .circleci/scripts/upload_binary_size_to_scuba.py || exit 0 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - name: Test PyTorch run: | export SHARD_NUMBER=0 @@ -281,5 +280,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml index df17edc1cd753..753c053304f61 100644 --- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml @@ -222,5 +222,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml index 472e05bbe06ad..ef6e69caff82a 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml @@ -240,5 +240,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 3c906a8d7d9d6..7580ba180d886 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -239,5 +239,4 @@ jobs: CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' run: | - export PYTHONPATH=$PWD - 
python tools/stats/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test From 09679af260b53c44c25b87750a84c1c50b6e9760 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 12 Jul 2021 07:09:11 -0700 Subject: [PATCH 081/122] Delete dead code in Tensor::to implementation (#61435) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61435 Deleted the following: - I couldn't find the NOTE mentioned so I deleted the reference to it - The memory_format check (because it always passes) - The requires_grad check (because it always passes) Test Plan: - run tests Reviewed By: soulitzer Differential Revision: D29636872 Pulled By: zou3519 fbshipit-source-id: 48a32c1821b72c512d337becf2398ce7f4cf01a2 --- aten/src/ATen/native/TensorConversions.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 638f588ffd6ef..4c199ecae56aa 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -67,18 +67,12 @@ Tensor to( bool copy, c10::optional optional_memory_format ) { - // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); - - TORCH_CHECK( - !(options_.has_memory_format() && optional_memory_format.has_value()), - "Cannot set memory_format both in TensorOptions and explicit argument; please delete " - "the redundant setter."); - auto options = options_.merge_memory_format(optional_memory_format); - - TORCH_CHECK(options.requires_grad_opt() == c10::nullopt, - "to(options) expects unset requires_grad flag, but got " - "options.requires_grad set as ", options.requires_grad()); + TensorOptions options = TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device) + .pinned_memory(pin_memory) + .memory_format(optional_memory_format); TORCH_CHECK(!options.has_layout() || self.layout() == options.layout(), "to(options) doesn't support converting to a different layout, " From ac086ca15b3605925847059ea7919c8ae50dfa31 Mon Sep 17 00:00:00 2001 From: mdmn07C5 Date: Mon, 12 Jul 2021 07:28:53 -0700 Subject: [PATCH 082/122] Update version.txt file path (#61177) Summary: The file version.txt is located one directory above generate_torch_version, some platforms are unable to find this file unless given an explicit path. 
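A minimal sketch of the underlying issue (illustrative only, simplified from the patched helper): a bare relative path is resolved against the current working directory, while anchoring the lookup to `__file__` makes it independent of where the script is launched from.

```python
from pathlib import Path

def read_version_fragile() -> str:
    # Resolved against the current working directory: only works when the
    # process is launched from the repository root.
    return open('version.txt').read().strip()

def read_version_robust() -> str:
    # Anchor the lookup to this file's location (tools/ sits one level below
    # the repo root, where version.txt lives), independent of the cwd.
    pytorch_root = Path(__file__).parent.parent
    return (pytorch_root / 'version.txt').read_text().strip()
```
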
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61177 Reviewed By: pbelevich Differential Revision: D29660334 Pulled By: ezyang fbshipit-source-id: f66105f782aaff031e373f96a69baabb13c89337 --- tools/generate_torch_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index 61682c9c89634..2ee17b76e52f4 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -13,7 +13,7 @@ def get_sha(pytorch_root: Union[str, Path]) -> str: def get_torch_version(sha: Optional[str] = None) -> str: pytorch_root = Path(__file__).parent.parent - version = open('version.txt', 'r').read().strip() + version = open(pytorch_root / 'version.txt', 'r').read().strip() if os.getenv('PYTORCH_BUILD_VERSION'): assert os.getenv('PYTORCH_BUILD_NUMBER') is not None From 00c4897c51749ce355d86fa80f833df2818c83d7 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 12 Jul 2021 08:08:09 -0700 Subject: [PATCH 083/122] use make_unique (#61272) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61272 Reviewed By: pbelevich Differential Revision: D29660354 Pulled By: ezyang fbshipit-source-id: f0aba1ea6983aec415915ed9b7dbced2e2b3b171 --- c10/cuda/CUDACachingAllocator.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index b48f75560f58b..0007ebd94106a 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -799,8 +799,7 @@ class DeviceCachingAllocator { if (it == graph_pools.end()) { // mempool_id does not reference an existing pool. Make a new pool for // this capture. - graph_pools.emplace(std::make_pair( - mempool_id, std::unique_ptr(new PrivatePool))); + graph_pools.emplace(mempool_id, std::make_unique()); } else { // mempool_id references an existing pool, which the current capture will // share. 
Check this pool is live (at least one other capture already @@ -1339,8 +1338,7 @@ class THCCachingAllocator { if (size < device_count) { device_allocator.resize(device_count); for (const auto i : c10::irange(size, device_count)) { - device_allocator[i] = std::unique_ptr( - new DeviceCachingAllocator()); + device_allocator[i] = std::make_unique(); } } } From d4549ba5dc7146dfa3e52993d6548fbdfd1256b0 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Mon, 12 Jul 2021 08:19:29 -0700 Subject: [PATCH 084/122] Add VS_VERSION to Circle (#61532) Summary: Fixes current HUD 10.1 failure https://app.circleci.com/pipelines/github/pytorch/pytorch/349359/workflows/ead2904b-3f37-4c9d-b271-a8e772046523/jobs/14713215 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61532 Test Plan: The new 10.1 CI run: https://app.circleci.com/pipelines/github/pytorch/pytorch/349677/workflows/b7143b56-e8e7-4f85-8bdf-0ce50788f3c0/jobs/14727686 Reviewed By: walterddr Differential Revision: D29661179 Pulled By: janeyx99 fbshipit-source-id: 5023c41fe6ddce4113116b07d8f0fd7d66c864a8 --- .../cimodel/data/windows_build_definitions.py | 1 + .circleci/config.yml | 17 +++++++++++++++++ .circleci/scripts/vs_install.ps1 | 2 +- .../build-parameters/pytorch-build-params.yml | 4 ++++ .../job-specs/pytorch-job-specs.yml | 6 ++++++ .../workflows/workflows-scheduled-ci.yml | 3 +++ 6 files changed, 32 insertions(+), 1 deletion(-) diff --git a/.circleci/cimodel/data/windows_build_definitions.py b/.circleci/cimodel/data/windows_build_definitions.py index 173e669cc68d5..b02104ecc2a08 100644 --- a/.circleci/cimodel/data/windows_build_definitions.py +++ b/.circleci/cimodel/data/windows_build_definitions.py @@ -78,6 +78,7 @@ def gen_tree(self): props_dict = { "build_environment": build_environment_string, "python_version": miniutils.quote(python_version), + "vs_version": miniutils.quote("16.8.6"), "vc_version": miniutils.quote(self.vscode_spec.dotted_version()), "vc_year": miniutils.quote(str(self.vscode_spec.year)), "vc_product": self.vscode_spec.get_product(), diff --git a/.circleci/config.yml b/.circleci/config.yml index b5d7c303d383d..e24fbebc3d558 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -357,6 +357,9 @@ pytorch_windows_params: &pytorch_windows_params python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" @@ -374,6 +377,7 @@ pytorch_windows_params: &pytorch_windows_params SCCACHE_BUCKET: "ossci-compiler-cache" CUDA_VERSION: <> PYTHON_VERSION: <> + VS_VERSION: <> VC_VERSION: <> VC_YEAR: <> VC_PRODUCT: <> @@ -719,6 +723,9 @@ jobs: python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" @@ -786,6 +793,9 @@ jobs: python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" @@ -7621,6 +7631,7 @@ workflows: vc_product: BuildTools vc_version: "" vc_year: "2019" + vs_version: "16.8.6" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 cuda_version: "10.1" @@ -7639,6 +7650,7 @@ workflows: vc_product: BuildTools vc_version: "" vc_year: "2019" + vs_version: "16.8.6" - update_s3_htmls: context: org-member filters: @@ -9179,6 +9191,7 @@ workflows: vc_product: BuildTools vc_version: "" vc_year: "2019" + vs_version: "16.8.6" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 cuda_version: "10.1" @@ -9191,6 +9204,7 @@ workflows: vc_product: 
BuildTools vc_version: "" vc_year: "2019" + vs_version: "16.8.6" when: << pipeline.parameters.run_master_build >> slow_gradcheck_build: jobs: @@ -9304,6 +9318,7 @@ workflows: python_version: "3.8" use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: @@ -9322,6 +9337,7 @@ workflows: test_name: pytorch-windows-test1 use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: @@ -9340,6 +9356,7 @@ workflows: test_name: pytorch-windows-test2 use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: diff --git a/.circleci/scripts/vs_install.ps1 b/.circleci/scripts/vs_install.ps1 index 8b3886347531c..a2e373078adb6 100644 --- a/.circleci/scripts/vs_install.ps1 +++ b/.circleci/scripts/vs_install.ps1 @@ -21,7 +21,7 @@ if (${env:INSTALL_WINDOWS_SDK} -eq "1") { if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe") { $VS_VERSION_major = [int] ${env:VS_VERSION}.split(".")[0] $existingPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -version "[${env:VS_VERSION}, ${env:VS_VERSION_major + 1})" -property installationPath - if ($existingPath -ne $null) { + if (($existingPath -ne $null) -and (!${env:CIRCLECI})) { echo "Found correctly versioned existing BuildTools installation in $existingPath" exit 0 } diff --git a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml index 5bc160f0c3523..6e6993245843f 100644 --- a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml +++ b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml @@ -89,6 +89,9 @@ pytorch_windows_params: &pytorch_windows_params python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" @@ -106,6 +109,7 @@ pytorch_windows_params: &pytorch_windows_params SCCACHE_BUCKET: "ossci-compiler-cache" CUDA_VERSION: <> PYTHON_VERSION: <> + VS_VERSION: <> VC_VERSION: <> VC_YEAR: <> VC_PRODUCT: <> diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 7a92af8b5200a..9bf9bc0ff1897 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -253,6 +253,9 @@ jobs: python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" @@ -320,6 +323,9 @@ jobs: python_version: type: string default: "3.8" + vs_version: + type: string + default: "16.8.6" vc_version: type: string default: "14.16" diff --git a/.circleci/verbatim-sources/workflows/workflows-scheduled-ci.yml b/.circleci/verbatim-sources/workflows/workflows-scheduled-ci.yml index 307248729250b..becd8cb020427 100644 --- a/.circleci/verbatim-sources/workflows/workflows-scheduled-ci.yml +++ b/.circleci/verbatim-sources/workflows/workflows-scheduled-ci.yml @@ -85,6 +85,7 @@ python_version: "3.8" use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: @@ -103,6 +104,7 @@ test_name: pytorch-windows-test1 use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: @@ -121,6 +123,7 @@ test_name: pytorch-windows-test2 
use_cuda: "1" vc_product: BuildTools + vs_version: "16.8.6" vc_version: "14.28.29333" vc_year: "2019" filters: From 0de35fe039f843f6bb390cd00a0e1f8550ab9317 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 12 Jul 2021 08:22:07 -0700 Subject: [PATCH 085/122] fix return local reference (#59913) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59913 Reviewed By: soulitzer Differential Revision: D29107110 Pulled By: ezyang fbshipit-source-id: c0f9888867c7dfeb05f6a3b9d2067df35e1e3ffb --- aten/src/ATen/native/TensorShape.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 22e04ec3cbd2e..7c6c804db5331 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -137,7 +137,7 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { } at::assert_no_internal_overlap(result); - const Tensor* pnotSkippedTensor = [](TensorList tensors) -> const Tensor* { + const Tensor* pnotSkippedTensor = [](const TensorList &tensors) -> const Tensor* { for (auto const &tensor : tensors) { if (should_skip(tensor)) { continue; From 93d98ecef7daf1c2b0e00af458c78c5d65910d1c Mon Sep 17 00:00:00 2001 From: Antonio Cuni Date: Mon, 12 Jul 2021 08:56:03 -0700 Subject: [PATCH 086/122] update the pytorch-gdb example so that it works on current master (#61175) Summary: As pointed out by https://github.com/pytorch/pytorch/pull/54339#issuecomment-872827580, the `pytorch-gdb` example is currently broken because the code has been refactored. This PR updates the example so that it works again. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61175 Reviewed By: heitorschueroff Differential Revision: D29660336 Pulled By: ezyang fbshipit-source-id: 8bcd32fc583c0b28a705ef37203ce7ad4d636732 --- CONTRIBUTING.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 010c21cc4ba73..87195362f2fd9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -852,13 +852,13 @@ object. Example of usage: ``` $ gdb python -GNU gdb (Ubuntu 9.2-0ubuntu1~20.04) 9.2 +GNU gdb (GDB) 9.2 [...] (gdb) # insert a breakpoint when we call .neg() -(gdb) break at::native:neg -No source file named at::native. +(gdb) break at::Tensor::neg +Function "at::Tensor::neg" not defined. Make breakpoint pending on future shared library load? (y or [n]) y -Breakpoint 1 (at::native:neg) pending. +Breakpoint 1 (at::Tensor::neg) pending. (gdb) run [...] @@ -868,13 +868,15 @@ Breakpoint 1 (at::native:neg) pending. tensor([1., 2., 3., 4.], dtype=torch.float64) >>> t.neg() -Breakpoint 1, at::native::neg (self=...) 
at [...]/pytorch/aten/src/ATen/native/UnaryOps.cpp:520 -520 Tensor neg(const Tensor& self) { return unary_op_impl(self, at::neg_out); } -(gdb) # the default repr of 'self' is not very useful -(gdb) p self -$1 = (const at::Tensor &) @0x7ffff72ed780: {impl_ = {target_ = 0x5555559df6e0}} -(gdb) torch-tensor-repr self -Python-level repr of self: +Thread 1 "python" hit Breakpoint 1, at::Tensor::neg (this=0x7ffb118a9c88) at aten/src/ATen/core/TensorBody.h:3295 +3295 inline at::Tensor Tensor::neg() const { +(gdb) # the default repr of 'this' is not very useful +(gdb) p this +$1 = (const at::Tensor * const) 0x7ffb118a9c88 +(gdb) p *this +$2 = {impl_ = {target_ = 0x55629b5cd330}} +(gdb) torch-tensor-repr *this +Python-level repr of *this: tensor([1., 2., 3., 4.], dtype=torch.float64) ``` From 8754238410ae2f383dc0f970e24f8916ab30c2a1 Mon Sep 17 00:00:00 2001 From: hauntsaninja <> Date: Mon, 12 Jul 2021 08:56:54 -0700 Subject: [PATCH 087/122] torch._utils.ExceptionWrapper: fix for Exceptions with multiple args (#58131) Summary: Here's an example of what this PR should fix: ``` from torch._utils import ExceptionWrapper class TwoArgException(Exception): def __init__(self, msg, count): ... # If you need a "real world" exception with two args, here's one from the stdlib: # import asyncio # TwoArgException = asyncio.exceptions.LimitOverrunError # or if on Python 3.7, try: # TwoArgException = asyncio.streams.LimitOverrunError try: raise TwoArgException("oh no", 0) except Exception as e: data = ExceptionWrapper(where="in a test case") data.reraise() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/58131 Reviewed By: heitorschueroff Differential Revision: D29660248 Pulled By: ezyang fbshipit-source-id: cbcecfee9cac183354542e147ee3d956038c8986 --- torch/_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/_utils.py b/torch/_utils.py index 55d4fbc56de8d..1500e6ead84fd 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -422,7 +422,13 @@ def reraise(self): # Some exceptions have first argument as non-str but explicitly # have message field raise self.exc_type(message=msg) - raise self.exc_type(msg) + try: + exception = self.exc_type(msg) + except TypeError: + # If the exception takes multiple arguments, don't try to + # instantiate since we don't know how to + raise RuntimeError(msg) from None + raise exception def _get_available_device_type(): From 7136a62b561a767ee875fd6ac1bedbd10d667974 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Mon, 12 Jul 2021 08:57:38 -0700 Subject: [PATCH 088/122] Add `expecttest` to CONTRIBUTING.md (#61163) Summary: Now expecttest is an independent library but `CONTRIBUTING.md` and `requirements.txt` do not mention the need of the library. Related: https://github.com/pytorch/pytorch/pull/60658 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61163 Reviewed By: heitorschueroff Differential Revision: D29660296 Pulled By: ezyang fbshipit-source-id: e2e86d42526c83bec7cdf7221e19fe83d9686103 --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87195362f2fd9..0d72537981c3a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -348,7 +348,7 @@ in `test/test_jit.py`. Your command would be: python test/test_jit.py TestJit.test_Sequential ``` -The `hypothesis` library must be installed to run the tests. `mypy` is +The `expecttest` and `hypothesis` libraries must be installed to run the tests. 
`mypy` is an optional dependency, and `pytest` may help run tests more selectively. All these packages can be installed with `conda` or `pip`. From bacf8ecbd16567de7f7bcf126ab22d62cf01f8a3 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 12 Jul 2021 09:09:57 -0700 Subject: [PATCH 089/122] Make pin_memory/is_pinned use BackendSelect (#60547) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60547 These now dispatch on the optional Device argument, which specifies what device you want to pin for. We now directly register pinned memory implementations for CUDA specifically, eliminating the need for extra virtual methods. This makes it possible for other backends to override the behavior of pinned memory, c.f. https://github.com/pytorch/pytorch/pull/59291 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: albanD, bdhirsh Differential Revision: D29331881 Pulled By: ezyang fbshipit-source-id: db3b4e2c872ba1caa0243fecc60a4da65179ce28 --- aten/src/ATen/cuda/PinnedMemoryAllocator.cpp | 37 +++++++++++++++- aten/src/ATen/native/Memory.cpp | 43 +++++++------------ aten/src/ATen/native/native_functions.yaml | 10 +++++ .../ATen/templates/RegisterBackendSelect.cpp | 24 +++++++++++ aten/src/ATen/test/basic.cpp | 8 +--- tools/autograd/derivatives.yaml | 3 ++ 6 files changed, 89 insertions(+), 36 deletions(-) diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp index 3e6c683d8ce4e..4c65cb9223657 100644 --- a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp @@ -1,17 +1,50 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include -namespace at { namespace cuda { +namespace at { + +namespace cuda { at::Allocator* getPinnedMemoryAllocator() { auto state = globalContext().lazyInitCUDA(); return state->cudaHostAllocator; } -}} // namespace at::cuda +} // namespace cuda + +namespace native { + +bool is_pinned_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); + // TODO: unhook this + return detail::getCUDAHooks().isPinnedPtr(self.storage().data()); +} + +Tensor _pin_memory_cuda(const Tensor& self, c10::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); + auto* allocator = at::cuda::getPinnedMemoryAllocator(); + auto storage = Storage( + Storage::use_byte_size_t(), + detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()), + allocator, + /*resizable=*/false); + auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index b16557ba9e0c1..df6949b2d7d95 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -1,42 +1,29 @@ #include #include -#include -#include -#include -#include -#include namespace at { namespace native { -bool is_pinned(const Tensor& self, c10::optional device) { - TORCH_CHECK(!device.has_value() || device->is_cuda(), "non-cuda device doesn't have a concept of is_pinned"); - return detail::getCUDAHooks().isPinnedPtr(self.storage().data()); +// Exposes at::has_internal_overlap as an operator for testing purposes +int64_t _debug_has_internal_overlap(const Tensor& self) { + return static_cast(at::has_internal_overlap(self)); 
+} + +// Technically, we could force backends to explicitly say "no, we don't support +// pinned memory, always return false", but this makes life a little easier when +// you haven't loaded the backend extension at all (which can happen, e.g., on a +// CPU build of PyTorch and you try to check if something is CUDA pinned) +bool is_pinned_default(const Tensor& self, c10::optional device) { + return false; } Tensor pin_memory(const Tensor& self, c10::optional device) { - if (!self.device().is_cpu()) { - AT_ERROR("cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); - } - TORCH_CHECK(!device.has_value() || device->is_cuda(), "non-cuda device doesn't have a concept of pinned memory"); - if (self.is_pinned()) { + // Kind of mad that I have to do two dynamic dispatches here, pretty + // annoying + if (self.is_pinned(device)) { return self; } - auto* allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); - auto storage = Storage( - Storage::use_byte_size_t(), - detail::computeStorageNbytes( - self.sizes(), self.strides(), self.dtype().itemsize()), - allocator, - /*resizable=*/false); - auto tensor = at::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); - tensor.copy_(self); - return tensor; -} - -// Exposes at::has_internal_overlap as an operator for testing purposes -int64_t _debug_has_internal_overlap(const Tensor& self) { - return static_cast(at::has_internal_overlap(self)); + return at::_pin_memory(self, device); } } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index bb3e027fe0123..b3743a4be1150 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3236,10 +3236,20 @@ - func: is_pinned(Tensor self, Device? device=None) -> bool variants: method + dispatch: + CUDA: is_pinned_cuda + CompositeExplicitAutograd: is_pinned_default +# TODO: add a copy kwarg that guarantees that the tensor is put into fresh +# pinned memory - func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) variants: method +# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor +- func: _pin_memory(Tensor self, Device? device=None) -> Tensor + dispatch: + CUDA: _pin_memory_cuda + - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp index 27b8e2bcc5125..91fa8ee40334d 100644 --- a/aten/src/ATen/templates/RegisterBackendSelect.cpp +++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp @@ -16,8 +16,32 @@ namespace { ${backend_select_method_definitions} +bool is_pinned(const Tensor& self, c10::optional device) { + // Only CPU tensors can be pinned + if (!self.is_cpu()) { + return false; + } + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::is_pinned", "") + .typed)>(); + // TODO: fetch scalar type from Tensor? But it doesn't really matter... 
+ DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return op.redispatch(_dk, self, device); +} + +at::Tensor _pin_memory(const Tensor& self, c10::optional device) { + TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::_pin_memory", "") + .typed)>(); + DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return op.redispatch(_dk, self, device); +} + TORCH_LIBRARY_IMPL(aten, BackendSelect, m) { ${backend_select_function_registrations}; + m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned)); + m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory)); } } // namespace diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 172252edca3fe..9a6d39fd71729 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -423,7 +423,7 @@ TEST(BasicTest, FactoryMethodsTest) { ASSERT_EQ(tensor1.device(), at::kCPU); ASSERT_FALSE(tensor1.requires_grad()); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_ANY_THROW(tensor1.is_pinned()); + ASSERT_FALSE(tensor1.is_pinned()); #endif // ATEN_CPU_STATIC_DISPATCH if (torch::cuda::is_available()) { @@ -454,11 +454,7 @@ TEST(BasicTest, FactoryMethodsTest) { // This is a bug // Issue https://github.com/pytorch/pytorch/issues/30405 ASSERT_FALSE(tensor1.requires_grad()); - - // This will cause an exception - // Issue https://github.com/pytorch/pytorch/issues/30405 - // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_ANY_THROW(tensor1.is_pinned()); + ASSERT_FALSE(tensor1.is_pinned()); } // Test _like variants diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 5abb1a1446da5..ffbdd046c76b2 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2211,3 +2211,6 @@ - name: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor data: _segment_reduce_backward(grad, result, data, reduce, lengths) + +- name: _pin_memory(Tensor self, Device? device=None) -> Tensor + self: grad From c18017190bb7ec5a42b99fc5a5c8e082bcaab839 Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Mon, 12 Jul 2021 09:16:52 -0700 Subject: [PATCH 090/122] Relax some linalg test tolerances (#61101) Summary: We are seeing some test failures on A100 machine, though TF32 matmul is not involved in these cases. I tried `svd_lowrank` test. It passed while testing itself, but failed when I run the whole test suite. It's probably some random seed issue. Relax test tolerance would be much easier to do. Some SVD tests failed when we compare CPU float32 vs GPU float32. Since linear algebra are sort of unstable at single precision, comparing two single precision results may give some false positives. So we calculate CPU results in float64 or complex128, which is much more accurate. 
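A minimal sketch of that double-precision-reference pattern (helper name and tolerances are illustrative, and `op` is assumed to return a single tensor):

```python
import torch

def check_against_double_reference(op, shape, device, dtype):
    # Build the reference input in double (or complex double) precision on CPU.
    ref_dtype = torch.complex128 if dtype.is_complex else torch.float64
    cpu_input = torch.randn(shape, device='cpu', dtype=ref_dtype)
    # Downcast only for the device/dtype under test.
    device_input = cpu_input.to(device=device, dtype=dtype)
    expected = op(cpu_input)         # accurate reference
    actual = op(device_input).cpu()  # lower-precision result being checked
    # Compare across dtypes; the tolerance absorbs the single-precision error.
    assert torch.allclose(actual.to(ref_dtype), expected, rtol=1e-5, atol=1e-5)
```
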
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61101 Reviewed By: ngimel Differential Revision: D29593483 Pulled By: mruberry fbshipit-source-id: 3df651e3cca1b0effc1a4ae29d4f26b1cb4082ed --- test/test_linalg.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_linalg.py b/test/test_linalg.py index 779d13264ce5b..fe15f8b6a7eb7 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2753,7 +2753,7 @@ def run_subtest(actual_rank, matrix_size, batches, device, svd_lowrank, **option # check if u, s, v is a SVD u, s, v = u[..., :q], s[..., :q], v[..., :q] A = u.matmul(s.diag_embed()).matmul(v.transpose(-2, -1)) - self.assertEqual(A, a) + self.assertEqual(A, a, rtol=1e-7, atol=2e-7) # check if svd_lowrank produces same singular values as torch.svd U, S, V = torch.svd(a) @@ -2811,8 +2811,11 @@ def test_svd_complex(self, device, dtype): self.assertEqual(t, t2) def _test_svd_helper(self, shape, some, col_maj, device, dtype): - cpu_tensor = torch.randn(shape, device='cpu').to(dtype) - device_tensor = cpu_tensor.to(device=device) + # To have accurate tests and less false positives on different CPUs and GPUs, + # we use double or complex double accuracy for CPU reference. + cpu_dtype = torch.complex128 if dtype.is_complex else torch.float64 + cpu_tensor = torch.randn(shape, device='cpu', dtype=cpu_dtype) + device_tensor = cpu_tensor.to(device=device, dtype=dtype) if col_maj: cpu_tensor = cpu_tensor.t() device_tensor = device_tensor.t() @@ -2826,7 +2829,7 @@ def _test_svd_helper(self, shape, some, col_maj, device, dtype): # then the corresponding column of the V has to be changed. # Thus here we only compare result[..., :m].abs() from CPU and device. for x, y in zip(cpu_result, device_result): - self.assertEqual(x[..., :m].abs(), y[..., :m].abs(), atol=1e-5, rtol=0) + self.assertEqual(x[..., :m].abs(), y[..., :m].abs(), exact_dtype=False) @skipCUDAIfNoMagma @skipCPUIfNoLapack From d46689a2017cc046abdc938247048952df4f6de7 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Mon, 12 Jul 2021 09:19:23 -0700 Subject: [PATCH 091/122] OpInfo reference tests for `add` and `sub` (#61169) Summary: This PR adds OpInfo reference checks for `add, sub`. 
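NumPy has no `alpha` keyword, so the reference functions in the diff below emulate it; a quick illustration of the equivalence being checked:

```python
import numpy as np
import torch

def add_ref(input, other, *, alpha=1):
    # Emulates torch.add(input, other, alpha=alpha) with plain NumPy calls.
    return np.add(input, np.multiply(alpha, other))

a, b = np.arange(4.0), np.ones(4)
t = torch.add(torch.from_numpy(a), torch.from_numpy(b), alpha=2)  # a + 2*b
assert np.allclose(add_ref(a, b, alpha=2), t.numpy())
```
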
See https://github.com/pytorch/pytorch/issues/54261 cc: mruberry pmeier Pull Request resolved: https://github.com/pytorch/pytorch/pull/61169 Reviewed By: iramazanli Differential Revision: D29625702 Pulled By: mruberry fbshipit-source-id: c5e536ab52865890990353c5c862b44b5a16ed20 --- torch/testing/_internal/common_methods_invocations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d8eea09102ef0..757aee0be5226 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -4866,6 +4866,8 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=[torch.cdouble]), )), OpInfo('add', + # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate + ref=lambda input, other, *, alpha=1: np.add(input, np.multiply(alpha, other)), dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), assert_autodiffed=True, sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2), @@ -4878,6 +4880,8 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_forward_ad=True, sample_inputs_func=sample_inputs_binary_pwise), OpInfo('sub', + # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate + ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)), aliases=('subtract',), dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), assert_autodiffed=True, From 99959fe3f5ae0867e1f67fceaf6c6789b028770a Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 12 Jul 2021 10:02:45 -0700 Subject: [PATCH 092/122] [DataLoader] Adding demux and mux DataPipe-s (#61234) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61234 * **#61234 [WIP] Adding demux and mux DataPipe API examples** Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29588836 Pulled By: VitalyFedyunin fbshipit-source-id: 523d12ea6be7507d706b4c6d8827ec1ac4ccabc3 --- test/test_datapipe.py | 20 +++++ torch/utils/data/datapipes/iter/combining.py | 35 ++++++++ torch/utils/data/standard_pipes.ipynb | 87 ++++++++++++++++++++ 3 files changed, 142 insertions(+) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index f42faf0cb942f..d6c4382eb8d23 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -76,6 +76,15 @@ def create_temp_dir_and_files(): (temp_sub_dir, temp_sub_file1_name, temp_sub_file2_name)] +class NumbersDataset(IterDataPipe): + def __init__(self, size=10): + self.size = size + + def __iter__(self): + for i in range(self.size): + yield i + + class TestIterableDataPipeBasic(TestCase): def setUp(self): @@ -255,6 +264,17 @@ def test_groupbykey_iterable_datapipe(self): rec[i][1].close() self.assertEqual(count, 8) + def test_demux_mux_datapipe(self): + numbers = NumbersDataset(10) + n1, n2 = numbers.demux(2, lambda x: x % 2) + self.assertEqual([0, 2, 4, 6, 8], list(n1)) + self.assertEqual([1, 3, 5, 7, 9], list(n2)) + + numbers = NumbersDataset(10) + n1, n2, n3 = numbers.demux(3, lambda x: x % 3) + n = n1.mux(n2, n3) + self.assertEqual(list(range(10)), list(n)) + class FileLoggerSimpleHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): def __init__(self, *args, logfile=None, **kwargs): diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index f6b90f0117d79..0693b1f0ad6de 
100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -1,3 +1,5 @@ +import functools + from torch.utils.data import IterDataPipe, functional_datapipe from typing import Iterator, Optional, Sized, Tuple, TypeVar @@ -60,6 +62,39 @@ def __new__(cls, datapipe, instances): return [IterateBuffer(buffer) for i in range(instances)] +@functional_datapipe('demux') +class DemultiplexerIterDataPipe(IterDataPipe): + + def __new__(cls, datapipe, instances, classifier_fn): + result = [] + buffer = list(datapipe) + + def filter_fn(classifier_fn, i, x): + return classifier_fn(x) == i + return [IterateBuffer(buffer).filter(functools.partial(filter_fn, classifier_fn, i)) for i in range(instances)] + +@functional_datapipe('mux') +class MultiplexerIterDataPipe(IterDataPipe): + + def __init__(self, *datapipes): + self.datapipes = datapipes + + def __iter__(self): + iterators = [iter(x) for x in self.datapipes] + finished = {} + had_more = True + while had_more: + had_more = False + for i in range(len(iterators)): + if i not in finished: + try: + value = iterators[i].__next__() + had_more = True + yield value + except StopIteration: + finished[i] = 1 + + @functional_datapipe('zip') class ZipIterDataPipe(IterDataPipe[Tuple[T_co]]): r""" :class:`ZipIterDataPipe`. diff --git a/torch/utils/data/standard_pipes.ipynb b/torch/utils/data/standard_pipes.ipynb index e567001eb2a16..3f58a365dffbd 100644 --- a/torch/utils/data/standard_pipes.ipynb +++ b/torch/utils/data/standard_pipes.ipynb @@ -1001,6 +1001,93 @@ " print(i)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demultiplexer\n", + "\n", + "Function: `demux`\n", + "\n", + "Description: \n", + "\n", + "Alternatives:\n", + "\n", + "Arguments:\n", + " \n", + "Example:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "4\n", + "7\n" + ] + } + ], + "source": [ + "dp = ExampleIterPipe(10)\n", + "dp1, dp2, dp3 = dp.demux(3, lambda x: x % 3)\n", + "for i in dp2:\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multiplexer\n", + "\n", + "Function: `mux`\n", + "\n", + "Description: \n", + "\n", + "Alternatives:\n", + "\n", + "Arguments:\n", + " \n", + "Example:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "0\n", + "0\n", + "1\n", + "10\n", + "100\n", + "2\n", + "20\n", + "200\n" + ] + } + ], + "source": [ + "dp1 = ExampleIterPipe(3)\n", + "dp2 = ExampleIterPipe(3).map(lambda x: x * 10)\n", + "dp3 = ExampleIterPipe(3).map(lambda x: x * 100)\n", + "\n", + "dp = dp1.mux(dp2, dp3)\n", + "for i in dp:\n", + " print(i)" + ] + }, { "source": [ "## Concat\n", From 5a20c56ebce3426397210e91693fbbeade8b46ba Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Mon, 12 Jul 2021 10:08:21 -0700 Subject: [PATCH 093/122] [static runtime] Remove hasOperation() check (#61496) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61496 glow::FusionGroup is JitOnlyOperator that produces an Operation when passed a Node* https://fburl.com/ybwfn3bl hasOperation doesn't return true in that case https://fburl.com/19wd10aw by removing the hasOperation() check, the Operation gets successfully materialized, and static runtime enables 
successfully and runs ok. Will check that the outputs match with jit interpreter Test Plan: Test with 281805158_2 ``` ./buck-out/gen/admarket/lib/ranking/prediction_replayer/replayer --model_inference_type_target=DISAGG_ACCELERATOR --prediction_replayer_force_model_type=inline_cvr_post_imp_model --prediction_replayer_force_model=281805158_2 --prediction_replayer_target_tier=127.0.0.1:7447 --prediction_replayer_input_stream_filename=/data/users/ansha/tmp/adfinder/filter_requests_inline_cvr_post_imp_model_1000_2021_04_29 --ignore_model_id_mismatch --check_performance --fully_remote_sr_connection_options="overall_timeout:10000000,processing_timeout:10000000" --use_new_encoding_for_ads_services --use_new_encoding_from_model_id_to_shard_id --sigrid_force_model_dir=/data/users/ansha/tmp/adfinder/281805158_2/ --sigrid_predictor_model_suffix=.predictor.disagg.local —use_new_encoding_from_model_id_to_shard_id=true --prediction_replayer_force_model_kind=19 --pytorch_predictor_static_runtime_enable=true --prediction_replayer_target_qps=1 ``` ``` NNPI_LOG_LEVEL=0 USE_INF_API=1 ./buck-out/gen/sigrid/predictor/sigrid_remote_predictor_glow_nnpi \ --force_models=281805158_2 \ --sigrid_predictor_model_suffix=.predictor.disagg.remote_other \ --gflags_config_path=sigrid/predictor/gflags/predictor_gflags_ads_perf_glow_nnpi_pyper_v1 \ --smc_server_port=7447 \ --sigrid_predictor_tier_name=sigrid.predictor.perf.dianshi_staticruntime_debug_0604.test.storage \ --predictor_storage_smc_tier=sigrid.predictor.perf.dianshi_staticruntime_debug_0604.test.storage \ --predictor_storage_smc_tier_v2=sigrid.predictor.perf.dianshi_staticruntime_debug_0604.test.storage \ --torch_glow_min_fusion_group_size=30 \ --glow_enable_sanitize_inputs=100 \ --sigrid_force_model_dir=/data/users/ansha/tmp/adfinder/281805158_2/ \ --pytorch_predictor_static_runtime_enable=true \ --pytorch_predictor_glow_enable=true \ --pytorch_predictor_enable_loading_xl_format_on_cpu=false \ --pytorch_disagg_acc_input_dump_path=/tmp/ ``` Reviewed By: hlu1 Differential Revision: D29647043 fbshipit-source-id: 8ce6dc0f4f0464b65ca6a8c9d42e3d8bb392e66e --- torch/csrc/jit/runtime/static/impl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 8a24b84e4b0f8..4f796ffa56930 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1357,7 +1357,6 @@ ProcessedNode::ProcessedNode( } { const Operator& op = node->getOperator(); - TORCH_CHECK(op.hasOperation()); op_ = op.getOperation(node); VLOG(1) << "Fallback interpreter for node: " << PrintNode(node); } From 583b045fc3e37b4bf44449fad5ec0a1f182daaf8 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 12 Jul 2021 10:12:56 -0700 Subject: [PATCH 094/122] Make .contiguous(memory_format) call .clone(memory_format) (#61456) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61456 functorch is unable to `vmap(grad(f))` when `f` contains a `.contiguous` call. This is because `.contiguous` (when it is not a no-op) decomposes to `.copy_` under grad and the `.copy_` is not compatible with vmap. The fix for this is to have `.contiguous` call `.clone` instead of `.copy_`. `clone` is a primitive w.r.t. to autograd, so `grad` decomposes contiguous into clone. Perf testing (forward pass) - [script and output](https://gist.github.com/zou3519/294f583b9c5d7bdf234d5295f97fb02e) - The instruction count increased from 774479 to 781379. 
This is because we're now calling .clone(), which does an additional dispatch. We could optimize the implementation of clone() to not dispatch on .copy_() in the future if we really care about this. Perf testing (backward pass) - [script and output](https://gist.github.com/zou3519/6fbdb121de6342334192d55c8a72276a) - The instruction count decreased from 5402648 to 5335977. This is because the [backward for .clone](https://github.com/pytorch/pytorch/blob/9b908ab0d0a947d89ac3137f8c4a05a87c35f568/tools/autograd/derivatives.yaml#L383) is a lot simpler than the [backward for copy_](https://github.com/pytorch/pytorch/blob/9b908ab0d0a947d89ac3137f8c4a05a87c35f568/torch/csrc/autograd/functions/tensor.cpp#L37-L41) - The backward for .clone() and .copy_() end up doing the same thing for contiguous (from reading the code above, they both do no-op copies). Test Plan: - wait for existing tests (test_view_ops have the tests) - functorch isn't tested in PyTorch CI yet. - Taking suggestions on how to write a test for this. I'm thinking we could use LoggingTensor from #59760 (because it logs underneath autograd) and test that clone is called instead of copy_ but I didn't want to refactor it into a utility Reviewed By: soulitzer Differential Revision: D29636859 Pulled By: zou3519 fbshipit-source-id: 97eb56bfae1c4bb31612dc9d06536019f21d69a6 --- aten/src/ATen/native/TensorProperties.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index c23eee9719fb3..40a4d6219e7e5 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -65,8 +65,7 @@ Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { memory_format != MemoryFormat::Preserve, "preserve memory format is unsupported by the contiguous operator"); - auto result = at::empty_like(self, self.options(), memory_format); - return result.copy_(self); + return self.clone(memory_format); } bool is_set_to(const Tensor& self, const Tensor& src) { From 25a705610fd72bce08e4c8c601e2541b529c5f25 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 12 Jul 2021 10:23:30 -0700 Subject: [PATCH 095/122] ENH Adds support for no-batch dim in AdaptiveAvgPool1d (#61264) Summary: Towards https://github.com/pytorch/pytorch/issues/60585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61264 Reviewed By: iramazanli Differential Revision: D29615292 Pulled By: jbschlosser fbshipit-source-id: 826d1c87d67261a7211270e90e3a1022bbbe37bd --- aten/src/ATen/native/Pooling.cpp | 6 +++--- torch/nn/modules/pooling.py | 9 +++++++-- torch/testing/_internal/common_nn.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 750089e8d4f3c..2328949c21dbe 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -22,14 +22,14 @@ static void check1d( } Tensor adaptive_avg_pool1d(const Tensor & self, IntArrayRef output_size) { - checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); + checkDimRange("adaptive_avg_pool1d", TensorArg(self, "self", 1), 2, 4 /* exclusive */); check1d("adaptive_avg_pool1d", "output_size", output_size); auto output = at::adaptive_avg_pool2d( - self.unsqueeze(2), + self.unsqueeze(-2), {1, output_size[0]}); - return output.squeeze(2); + return output.squeeze(-2); } std::tuple adaptive_max_pool1d(const Tensor & self, IntArrayRef output_size) { diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index d474050091f67..8dd5bb01a9fd4 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -1078,11 +1078,16 @@ def extra_repr(self) -> str: class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd): r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes. - The output size is H, for any input size. + The output size is :math:`L_{out}`, for any input size. The number of output features is equal to the number of input planes. Args: - output_size: the target output size H + output_size: the target output size :math:`L_{out}`. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. Examples: >>> # target output size of 5 diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 016c3690bbfe9..3916c308d2989 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -1248,6 +1248,16 @@ def fractional_max_pool3d_test(test_case): fullname='FractionalMaxPool3d_asymsize') +def single_batch_reference_fn(input, parameters, module): + """Reference function for modules supporting no batch dimensions. + + The module is passed the input and target in batched form with a single item. + The output is squeezed to compare with the no-batch input. 
+ """ + single_batch_input = input.unsqueeze(0) + with freeze_rng_state(): + return module(single_batch_input).squeeze(0) + new_module_tests = [ poissonnllloss_no_reduce_test(), bceloss_no_reduce_test(), @@ -3188,6 +3198,14 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::AdaptiveAvgPool1dOptions(3)', input_fn=lambda: torch.rand(1, 3, 5), ), + dict( + module_name='AdaptiveAvgPool1d', + constructor_args=(3,), + cpp_constructor_args='torch::nn::AdaptiveAvgPool1dOptions(3)', + input_fn=lambda: torch.rand(3, 5), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), dict( module_name='AdaptiveAvgPool1d', constructor_args=(1,), From f2857883c4c148ced4f920431b38532fe8081b73 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 12 Jul 2021 10:27:42 -0700 Subject: [PATCH 096/122] Add DataPipes Graph Functions (#61235) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61235 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29588834 Pulled By: VitalyFedyunin fbshipit-source-id: e0331d6e1fc2a3f8b6211aac83965bcf13165161 --- test/test_datapipe.py | 66 ++++++++++++++++++++++++++++++--------- torch/utils/data/graph.py | 44 ++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 15 deletions(-) create mode 100644 torch/utils/data/graph.py diff --git a/test/test_datapipe.py b/test/test_datapipe.py index d6c4382eb8d23..fbf4eb9918ed6 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -24,6 +24,7 @@ import torch import torch.nn as nn import torch.utils.data.datapipes as dp +import torch.utils.data.graph from torch.testing._internal.common_utils import (TestCase, run_tests) from torch.utils.data import ( @@ -39,6 +40,17 @@ HAS_TORCHVISION = False skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision") +try: + import dill + # XXX: By default, dill writes the Pickler dispatch table to inject its + # own logic there. This globally affects the behavior of the standard library + # pickler for any user who transitively depends on this module! + # Undo this extension to avoid altering the behavior of the pickler globally. 
+ dill.extend(use_dill=False) + HAS_DILL = True +except ImportError: + HAS_DILL = False +skipIfNoDill = skipIf(not HAS_DILL, "no dill") T_co = TypeVar('T_co', covariant=True) @@ -76,15 +88,6 @@ def create_temp_dir_and_files(): (temp_sub_dir, temp_sub_file1_name, temp_sub_file2_name)] -class NumbersDataset(IterDataPipe): - def __init__(self, size=10): - self.size = size - - def __iter__(self): - for i in range(self.size): - yield i - - class TestIterableDataPipeBasic(TestCase): def setUp(self): @@ -136,7 +139,7 @@ def test_loadfilesfromdisk_iterable_datapipe(self): rec[1].close() self.assertEqual(count, len(self.temp_files)) - + # TODO(VitalyFedyunin): Generates unclosed buffer warning, need to investigate def test_readfilesfromtar_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_tarfile_pathname = os.path.join(temp_dir, "test_tar.tar") @@ -163,7 +166,7 @@ def test_readfilesfromtar_iterable_datapipe(self): self.assertEqual(data_ref[1].read(), f.read()) data_ref[1].close() - + # TODO(VitalyFedyunin): Generates unclosed buffer warning, need to investigate def test_readfilesfromzip_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_zipfile_pathname = os.path.join(temp_dir, "test_zip.zip") @@ -231,7 +234,7 @@ def _helper(prior_dp, dp, channel_first=False): datapipe4.add_handler(_png_decoder) _helper(cached, datapipe4, channel_first=True) - + # TODO(VitalyFedyunin): Generates unclosed buffer warning, need to investigate def test_groupbykey_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_tarfile_pathname = os.path.join(temp_dir, "test_tar.tar") @@ -460,7 +463,8 @@ def _worker_init_fn(worker_id): class TestFunctionalIterDataPipe(TestCase): - def test_picklable(self): + # TODO(VitalyFedyunin): If dill installed this test fails + def _test_picklable(self): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ (dp.iter.Map, IDP(arr), (), {}), @@ -540,7 +544,8 @@ def fn(item, dtype=torch.float, *, sum=False): for x, y in zip(map_dp_nl, input_dp_nl): self.assertEqual(x, torch.tensor(y, dtype=torch.float)) - def test_map_datapipe_nested_level(self): + # TODO(VitalyFedyunin): If dill installed this test fails + def _test_map_datapipe_nested_level(self): input_dp = IDP([list(range(10)) for _ in range(3)]) @@ -901,7 +906,8 @@ def test_zip_datapipe(self): class TestFunctionalMapDataPipe(TestCase): - def test_picklable(self): + # TODO(VitalyFedyunin): If dill installed this test fails + def _test_picklable(self): arr = range(10) picklable_datapipes: List[ Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]] @@ -1277,6 +1283,36 @@ def __iter__(self) -> Iterator[T]: with runtime_validation_disabled(): self.assertEqual(list(d for d in dp), ds) +class NumbersDataset(IterDataPipe): + def __init__(self, size=10): + self.size = size + + def __iter__(self): + for i in range(self.size): + yield i + + +class TestGraph(TestCase): + @skipIfNoDill + def test_simple_traverse(self): + numbers_dp = NumbersDataset(size=50) + mapped_dp = numbers_dp.map(lambda x: x * 10) + graph = torch.utils.data.graph.traverse(mapped_dp) + expected : Dict[Any, Any] = {mapped_dp: {numbers_dp: {}}} + self.assertEqual(expected, graph) + + # TODO(VitalyFedyunin): This test is incorrect because of 'buffer' nature + # of the fork fake implementation, update fork first and fix this test too + @skipIfNoDill + def test_traverse_forked(self): + numbers_dp = NumbersDataset(size=50) + dp0, dp1, dp2 = numbers_dp.fork(3) + dp0_upd = dp0.map(lambda x: x * 10) + 
dp1_upd = dp1.filter(lambda x: x % 3 == 1) + combined_dp = dp0_upd.mux(dp1_upd, dp2) + graph = torch.utils.data.graph.traverse(combined_dp) + expected = {combined_dp: {dp0_upd: {dp0: {}}, dp1_upd: {dp1: {}}, dp2: {}}} + self.assertEqual(expected, graph) if __name__ == '__main__': run_tests() diff --git a/torch/utils/data/graph.py b/torch/utils/data/graph.py new file mode 100644 index 0000000000000..e244a85f1377e --- /dev/null +++ b/torch/utils/data/graph.py @@ -0,0 +1,44 @@ +import io +import pickle + +from torch.utils.data import IterableDataset + +from typing import Any, Dict + +reduce_ex_hook = None + + +def stub_unpickler(): + return "STUB" + +# TODO(VitalyFedyunin): Make sure it works without dill module installed +def list_connected_datapipes(scan_obj): + + f = io.BytesIO() + p = pickle.Pickler(f) # Not going to work for lambdas, but dill infinite loops on typing and can't be used as is + + def stub_pickler(obj): + return stub_unpickler, () + + captured_connections = [] + + def reduce_hook(obj): + if obj == scan_obj: + raise NotImplementedError + else: + captured_connections.append(obj) + return stub_unpickler, () + + # TODO(VitalyFedyunin): Better do it as `with` context for safety + IterableDataset.set_reduce_ex_hook(reduce_hook) + p.dump(scan_obj) + IterableDataset.set_reduce_ex_hook(None) + return captured_connections + + +def traverse(datapipe): + items = list_connected_datapipes(datapipe) + d: Dict[Any, Any] = {datapipe: {}} + for item in items: + d[datapipe].update(traverse(item)) + return d From 3faf6a715dcdffeb03ccccf35c74ea57cd781634 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Mon, 12 Jul 2021 11:00:12 -0700 Subject: [PATCH 097/122] [special] migrate log_softmax (#60512) Summary: Reference: https://github.com/pytorch/pytorch/issues/50345 Rendered Docs: https://14335157-65600975-gh.circle-artifacts.com/0/docs/special.html#torch.special.log_softmax Pull Request resolved: https://github.com/pytorch/pytorch/pull/60512 Reviewed By: iramazanli Differential Revision: D29626262 Pulled By: mruberry fbshipit-source-id: c42d4105531ffb004f11f1ba6ae50be19bc02c91 --- aten/src/ATen/core/aten_interned_strings.h | 1 - aten/src/ATen/core/interned_strings.h | 2 ++ aten/src/ATen/native/SoftMax.cpp | 4 +++ aten/src/ATen/native/native_functions.yaml | 4 +++ docs/source/special.rst | 1 + torch/csrc/api/include/torch/special.h | 12 +++++++++ torch/csrc/jit/passes/normalize_ops.cpp | 1 + torch/nn/functional.py | 4 +-- torch/overrides.py | 1 + torch/special/__init__.py | 27 +++++++++++++++++++ .../_internal/common_methods_invocations.py | 27 ++++++++++++++++--- 11 files changed, 77 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 084061e2c909f..77c63cbffe621 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -439,7 +439,6 @@ _(aten, log_normal) \ _(aten, log_sigmoid) \ _(aten, log_sigmoid_backward) \ _(aten, log_sigmoid_forward) \ -_(aten, log_softmax) \ _(aten, _log_softmax) \ _(aten, _log_softmax_backward_data) \ _(aten, logcumsumexp) \ diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index e03c7620b3b7f..7dad347ba25a9 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -354,6 +354,8 @@ namespace c10 { _(aten, special_i0e) \ _(aten, special_i1) \ _(aten, special_i1e) \ + _(aten, log_softmax) \ + _(aten, special_log_softmax) \ _(aten, special_zeta) \ _(aten, 
has_torch_function) \ _(aten, hardswish) \ diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index d1258a82a2326..a445c6ddcabba 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -299,6 +299,10 @@ Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { + return at::log_softmax(input, dim, dtype); +} + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(softmax_lastdim_kernel); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index b3743a4be1150..2ca5d5711619c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9878,6 +9878,10 @@ python_module: special variants: function +- func: special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + ## Functions related to the fast Fourier transform and the torch.fft namespace # Note [FFT namespace binding] # Functions in the fft python module should have their names start with diff --git a/docs/source/special.rst b/docs/source/special.rst index 03306a5c5b33a..db772988aba44 100644 --- a/docs/source/special.rst +++ b/docs/source/special.rst @@ -36,6 +36,7 @@ Functions .. autofunction:: logit .. autofunction:: logsumexp .. autofunction:: log1p +.. autofunction:: log_softmax .. autofunction:: ndtr .. autofunction:: ndtri .. autofunction:: round diff --git a/torch/csrc/api/include/torch/special.h b/torch/csrc/api/include/torch/special.h index be0234b9dc267..55153a52c12c5 100644 --- a/torch/csrc/api/include/torch/special.h +++ b/torch/csrc/api/include/torch/special.h @@ -416,4 +416,16 @@ inline Tensor& log1p_out(Tensor& result, const Tensor& self) { return torch::special_log1p_out(result, self); } +/// Computes log followed by softmax(x) of the input +/// See https://pytorch.org/docs/master/special.html#torch.special.log_softmax. +/// +/// Example: +/// ``` +/// auto t = torch::randn(128, 128, dtype=kDouble); +/// torch::special::log_softmax(t, 0); +/// ``` +inline Tensor log_softmax(const Tensor& self, int64_t dim, c10::optional dtype) { + return torch::special_log_softmax(self, dim, dtype); +} + }} // torch::special diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp index f3812c5560079..e9c462f044a09 100644 --- a/torch/csrc/jit/passes/normalize_ops.cpp +++ b/torch/csrc/jit/passes/normalize_ops.cpp @@ -124,6 +124,7 @@ const std::unordered_map& getOperatorAliasMap() { {aten::special_digamma, aten::digamma}, {aten::special_psi, aten::digamma}, {aten::special_i0, aten::i0}, + {aten::special_log_softmax, aten::log_softmax}, {aten::orgqr, aten::linalg_householder_product}, {aten::special_gammaln, aten::lgamma}}; return alias_map; diff --git a/torch/nn/functional.py b/torch/nn/functional.py index d940b423e7ffa..1c2aa32aa1dc4 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1748,7 +1748,7 @@ def log_softmax(input: Tensor, dim: Optional[int] = None, _stacklevel: int = 3, r"""Applies a softmax followed by a logarithm. While mathematically equivalent to log(softmax(x)), doing these two - operations separately is slower, and numerically unstable. This function + operations separately is slower and numerically unstable. This function uses an alternative formulation to compute the output and gradient correctly. 
See :class:`~torch.nn.LogSoftmax` for more details. @@ -1757,7 +1757,7 @@ def log_softmax(input: Tensor, dim: Optional[int] = None, _stacklevel: int = 3, input (Tensor): input dim (int): A dimension along which log_softmax will be computed. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. - If specified, the input tensor is casted to :attr:`dtype` before the operation + If specified, the input tensor is cast to :attr:`dtype` before the operation is performed. This is useful for preventing data type overflows. Default: None. """ if has_torch_function_unary(input): diff --git a/torch/overrides.py b/torch/overrides.py index 3522c3497f711..d0bfd37b37c8f 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -918,6 +918,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.special.logit: lambda input: -1, torch.special.logsumexp: lambda input, dim, keepdim=False, out=None: -1, torch.special.log1p: lambda input: -1, + torch.special.log_softmax: lambda input, dim, dtype=None: -1, torch.special.round: lambda input: -1, torch.special.sinc: lambda input: -1, torch.special.ndtri: lambda input: -1, diff --git a/torch/special/__init__.py b/torch/special/__init__.py index a516dd3a4205f..99993a31b8d57 100644 --- a/torch/special/__init__.py +++ b/torch/special/__init__.py @@ -512,6 +512,33 @@ Alias for :func:`torch.round`. """) +log_softmax = _add_docstr(_special.special_log_softmax, + r""" +log_softmax(input, dim, *, dtype=None) -> Tensor +Computes softmax followed by a logarithm. + +While mathematically equivalent to log(softmax(x)), doing these two +operations separately is slower and numerically unstable. This function +is computed as: + +.. math:: + \text{log\_softmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right) +""" + r""" + +Args: + input (Tensor): input + dim (int): A dimension along which log_softmax will be computed. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is cast to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + +Example:: + >>> t = torch.ones(2, 2) + >>> torch.special.log_softmax(t, 0) + tensor([[-0.6931, -0.6931], + [-0.6931, -0.6931]]) +""") + zeta = _add_docstr(_special.special_zeta, r""" zeta(input, other, *, out=None) -> Tensor diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 757aee0be5226..9f996d724cd9c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -3623,13 +3623,13 @@ def sample_inputs_log_softmax(op_info, device, dtype, requires_grad, with_dtype= make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) if with_dtype: - cases = (((S, S, S), (1, torch.float64)),) + cases = (((S, S, S), (1, ), {'dtype': torch.float64}),) else: - cases = (((S, S, S), (1,)),) # type:ignore[assignment] + cases = (((S, S, S), (1,), {}),) # type:ignore[assignment] def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) + for shape, args, kwargs in cases: + yield SampleInput(make_arg(shape), args=args, kwargs=kwargs) return list(generator()) @@ -7662,6 +7662,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # is passed or not. Hence two OpInfo entries, one with dtype and other without. 
OpInfo( 'log_softmax', + aliases=('special.log_softmax', 'nn.functional.log_softmax'), supports_out=False, dtypes=floating_types_and(torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), @@ -7670,9 +7671,27 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): OpInfo( 'log_softmax', variant_test_name='dtype', + aliases=('special.log_softmax', 'nn.functional.log_softmax'), supports_out=False, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=partial(sample_inputs_log_softmax, with_dtype=True), + skips=( + # NOTE: This should work once https://github.com/pytorch/pytorch/pull/58838 is in + # RuntimeError: + # Unknown type name 'dtype': + # File "", line 2 + # def _fn(t0, s0: int, dtype: dtype = torch.float64): + # ~~~~~ <--- HERE + # return variant(t0, s0, dtype=torch.float64) + + # 'defaults' is being compiled since it was called from '_fn' + # File "", line 2 + # def _fn(t0, s0: int, dtype: dtype = torch.float64): + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # return variant(t0, s0, dtype=torch.float64) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + SkipInfo('TestJit', 'test_jit_alias_remapping'), + ), assert_autodiffed=True), UnaryUfuncInfo('logit', ref=scipy.special.logit if TEST_SCIPY else _NOTHING, From 51d18369c3279e5a43efea2cb235ae53d7ffba71 Mon Sep 17 00:00:00 2001 From: Amy He Date: Mon, 12 Jul 2021 11:11:48 -0700 Subject: [PATCH 098/122] [1/N] Nnapi backend delegation preprocess (#61499) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61499 Added a preprocess function for the delegate to Nnapi backend (internal and external files). In the past we had functions and classes for converting to the Nnapi backend. Now, these functions and classes will be wrapped by the delegate API. ### nnapi_backend_preprocess.cpp: Contains the preprocess function, which uses Pybind to call an existing python function, `convert_model_to_nnapi()`. - The model is wrapped by a `RecursiveScriptModule`, so that `convert_model_to_nnapi()` can run correctly, since when jumping from Python to C++ to Python, the model loses its original wrapper. - A tensor, which includes shape, data type, and quantization information, is passed through preprocess's compile_spec to `convert_model_to_nnapi()`. - Finally, the Nnapi model is serialized for mobile and returned as a string. ### nnapi_backend_lib.cpp: Contains stub functions for compile and execute, and is necessary for the Nnapi backend to be registered correctly. These will be implemented in a future PR. **TODO:** implement execute and compile for the delegate API; throw exceptions for incorrect an compile_spec; add OSS tests **Testing:** Tests were done locally (see D29647123). A simple module was lowered to Nnapi, saved locally, and examined. ghstack-source-id: 133415234 Test Plan: Tests were done locally (see D29647123). TODO: add test in OSS in test_backends.py after CMake is ready. I ran buck run caffe2:nnapi_backend_example. The model files are saved as nnapi_model.ptl and mobile_model.ptl. I checked that both zip files have expected contents. 
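For context, a rough Python-side sketch of the conversion path this preprocess function wraps, i.e. calling `convert_model_to_nnapi()` directly (useful until compile/execute are implemented). The toy ReLU module, the traced example input, and the `_save_for_lite_interpreter` call are illustrative assumptions only; whether a given model actually converts depends on the NNAPI serializer's op and shape support.

```python
import torch
from torch.backends._nnapi.prepare import convert_model_to_nnapi

# Toy model standing in for whatever module would be lowered to the "nnapi" backend.
model = torch.nn.ReLU()

# The example input carries the shape/dtype (and quantization) information that the
# delegate receives through method_compile_spec as {"forward": {"inputs": Tensor}}.
example_input = torch.randn(1, 3, 224, 224)

traced = torch.jit.trace(model, example_input)
traced.eval()

# Same python function that nnapi_backend_preprocess.cpp imports via pybind.
nnapi_model = convert_model_to_nnapi(traced, example_input)

# Serialize for mobile, mirroring the _save_for_mobile() call on the C++ side
# (_save_for_lite_interpreter is assumed here as the Python-side equivalent).
nnapi_model._save_for_lite_interpreter("nnapi_model.ptl")
```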
Reviewed By: iseeyuan Differential Revision: D29563351 fbshipit-source-id: 642e349356e38aecc1b9973c285569650c02668c --- .../jit/backends/nnapi/nnapi_backend_lib.cpp | 49 +++++++++++++++++++ .../nnapi/nnapi_backend_preprocess.cpp | 48 ++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp create mode 100644 torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp new file mode 100644 index 0000000000000..12545101350b0 --- /dev/null +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp @@ -0,0 +1,49 @@ +#include +#include + +namespace torch { +namespace jit { + +// This file has no implementation yet, but the declarations are necessary to +// register the backend properly and test preprocess +// TODO T91991928: implement compile() and execute() +class NnapiBackend : public PyTorchBackendInterface { + public: + // Constructor. + // NOLINTNEXTLINE(modernize-use-equals-default) + explicit NnapiBackend() {} + // NOLINTNEXTLINE(modernize-use-override) + virtual ~NnapiBackend() = default; + + bool is_available() override { + return true; + } + + // Function stub + // TODO: implement compile + c10::impl::GenericDict compile( + c10::IValue processed, + c10::impl::GenericDict method_compile_spec) override { + auto handles = + c10::Dict>>(); + return c10::impl::toGenericDict(handles); + } + + // Function stub + // TODO: implement execute + c10::impl::GenericList execute( + c10::IValue handle, + c10::impl::GenericList inputs) override { + c10::List output_list; + return c10::impl::toList(output_list); + } +}; + +namespace { +constexpr auto backend_name = "nnapi"; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +static auto cls = torch::jit::backend(backend_name); +} // namespace + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp new file mode 100644 index 0000000000000..89ec208748ac8 --- /dev/null +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +namespace py = pybind11; + +// Converts model to nnapi and serializes it for mobile +// Returns a dictionary string with one entry: +// Key: "NnapiModule" +// Value: a string of the nnapi module, saved for mobile +// +// method_compile_spec should contain an input Tensor with the following format: +// {"forward": {"inputs": Tensor}} +c10::IValue preprocess( + const torch::jit::Module& mod, + const c10::Dict& method_compile_spec, + const torch::jit::BackendDebugHandleGenerator& generate_debug_handles) { + // Import the python function for converting modules to nnapi + py::gil_scoped_acquire gil; + py::object pyModule = py::module_::import("torch.backends._nnapi.prepare"); + py::object pyMethod = pyModule.attr("convert_model_to_nnapi"); + + // Wrap the c module in a RecursiveScriptModule and call the python conversion + // function on it + auto out = + py::module::import("torch.jit._recursive").attr("wrap_cpp_module")(mod); + out.attr("eval")(); + // TODO: throw exception if compile_spec doesn't contain inputs + torch::Tensor inp = + method_compile_spec.at("forward").toGenericDict().at("inputs").toTensor(); + auto nnapi_pyModel = pyMethod(out, inp); + + // Cast the returned py object and save it for mobile + std::stringstream ss; + auto nnapi_model = 
py::cast(nnapi_pyModel.attr("_c")); + nnapi_model._save_for_mobile(ss); + + c10::Dict dict( + c10::StringType::get(), c10::StringType::get()); + dict.insert("NnapiModule", ss.str()); + return dict; +} + +constexpr auto backend_name = "nnapi"; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +static auto pre_reg = + torch::jit::backend_preprocess_register(backend_name, preprocess); From a52de0dfec4eeb288882cdea683a212f03cf3fc7 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 12 Jul 2021 11:12:39 -0700 Subject: [PATCH 099/122] .github: Add force_on_cpu tests for windows (#61472) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61472 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: walterddr Differential Revision: D29642893 Pulled By: seemethere fbshipit-source-id: 2dd2b295c71d79593ad7f71d6160de4042c08b80 --- .github/scripts/generate_ci_workflows.py | 3 +++ .github/scripts/generate_pytorch_test_matrix.py | 11 +++++++---- .github/templates/windows_ci_workflow.yml.j2 | 2 ++ .../periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 2 ++ .github/workflows/pytorch-win-vs2019-cpu-py3.yml | 2 ++ .../pytorch-win-vs2019-cuda10-cudnn7-py3.yml | 2 ++ .../pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 2 ++ 7 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 056dabeaaa4de..a6a6f1b85635d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -28,6 +28,7 @@ def PyTorchWindowsWorkflow( test_runner_type: str, cuda_version: str, on_pull_request: bool = False, + enable_force_on_cpu_test: YamlShellBool = "''", only_build_on_pull_request: bool = False, num_test_shards: int = 1, is_scheduled: Optional[str] = None, @@ -37,6 +38,7 @@ def PyTorchWindowsWorkflow( "test_runner_type": test_runner_type, "cuda_version": cuda_version, "on_pull_request": on_pull_request, + "enable_force_on_cpu_test": enable_force_on_cpu_test, "only_build_on_pull_request": only_build_on_pull_request and on_pull_request, "is_scheduled": is_scheduled, "num_test_shards": num_test_shards, @@ -106,6 +108,7 @@ def generate_workflow_file( test_runner_type=WINDOWS_CUDA_TEST_RUNNER, on_pull_request=True, num_test_shards=2, + enable_force_on_cpu_test=1, ), PyTorchWindowsWorkflow( build_environment="pytorch-win-vs2019-cuda11-cudnn8-py3", diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 70707c4e99f6f..193d1c7412158 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -30,10 +30,13 @@ def main() -> None: configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if NOGPU_RUNNER_TYPE is not None: + if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): + configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): + configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if 
os.getenv('ENABLE_FORCE_ON_CPU_TEST'): + configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} if os.getenv('ENABLE_SLOW_TEST'): configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 47234ccb986c7..df2bff1c9474e 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -108,6 +108,8 @@ jobs: env: TEST_RUNNER_TYPE: !{{ test_runner_type }} NUM_TEST_SHARDS: !{{ num_test_shards }} + NOGPU_RUNNER_TYPE: windows.4xlarge + ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 3af28a3bce87e..21aa5ddbebf0d 100644 --- a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -91,6 +91,8 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 + NOGPU_RUNNER_TYPE: windows.4xlarge + ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml index 753c053304f61..792ac2c4294ba 100644 --- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml @@ -84,6 +84,8 @@ jobs: env: TEST_RUNNER_TYPE: windows.4xlarge NUM_TEST_SHARDS: 2 + NOGPU_RUNNER_TYPE: windows.4xlarge + ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml index ef6e69caff82a..d3f8f15f19a7e 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml @@ -94,6 +94,8 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 + NOGPU_RUNNER_TYPE: windows.4xlarge + ENABLE_FORCE_ON_CPU_TEST: 1 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 7580ba180d886..2ed58c4a6b99b 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -93,6 +93,8 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 + NOGPU_RUNNER_TYPE: windows.4xlarge + ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} From 2aedd17661c600758228b694be061d4ca7906357 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 12 Jul 2021 11:12:39 -0700 Subject: [PATCH 100/122] .circleci: Remove force_on_cpu jobs from circleci (#61473) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61473 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: walterddr Differential Revision: D29642891 Pulled By: seemethere 
fbshipit-source-id: d51bb859bc28efe15618d1e65f1a1cee64d60508 --- .../cimodel/data/windows_build_definitions.py | 16 ++--- .circleci/config.yml | 58 ------------------- 2 files changed, 9 insertions(+), 65 deletions(-) diff --git a/.circleci/cimodel/data/windows_build_definitions.py b/.circleci/cimodel/data/windows_build_definitions.py index b02104ecc2a08..f45eaaa11b4d8 100644 --- a/.circleci/cimodel/data/windows_build_definitions.py +++ b/.circleci/cimodel/data/windows_build_definitions.py @@ -1,6 +1,7 @@ +# TODO: Delete this file after we get re-run with SSH on windows for GHA + import cimodel.lib.miniutils as miniutils from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN, NON_PR_BRANCH_LIST -from cimodel.data.simple.util.versions import CudaVersion class WindowsJob: @@ -143,18 +144,19 @@ def prefixed_year(self): def render(self): return "_".join(self.get_elements()) -_VC2019 = VcSpec(2019) WORKFLOW_DATA = [ - # VS2019 CUDA-10.1 - WindowsJob(None, _VC2019, CudaVersion(10, 1), master_only=True), - # VS2019 CUDA-10.1 force on cpu - WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True), - # TODO: This test is disabled due to https://github.com/pytorch/pytorch/issues/59724 # WindowsJob('_azure_multi_gpu', _VC2019, CudaVersion(11, 1), multi_gpu=True, master_and_nightly=True), ] +# NOTE: For users looking to re-run windows builds with SSH uncomment the following lines +# from cimodel.data.simple.util.versions import CudaVersion +# _VC2019 = VcSpec(2019) +# WORKFLOW_DATA.extend([ +# WindowsJob(None, _VC2019, CudaVersion(10, 1)), +# WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True), +# ]) def get_windows_workflows(): return [item.gen_tree() for item in WORKFLOW_DATA] diff --git a/.circleci/config.yml b/.circleci/config.yml index e24fbebc3d558..a6421d5507220 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7616,41 +7616,6 @@ workflows: branches: only: - postnightly - - pytorch_windows_build: - build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10.1" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: pytorch_windows_vs2019_py38_cuda10.1_build - python_version: "3.8" - use_cuda: "1" - vc_product: BuildTools - vc_version: "" - vc_year: "2019" - vs_version: "16.8.6" - - pytorch_windows_test: - build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10.1" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: pytorch_windows_vs2019_py38_cuda10.1_on_cpu_test1 - python_version: "3.8" - requires: - - pytorch_windows_vs2019_py38_cuda10.1_build - test_name: pytorch-windows-test1 - use_cuda: "0" - vc_product: BuildTools - vc_version: "" - vc_year: "2019" - vs_version: "16.8.6" - update_s3_htmls: context: org-member filters: @@ -9182,29 +9147,6 @@ workflows: name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test requires: - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - - pytorch_windows_build: - build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10.1" - name: pytorch_windows_vs2019_py38_cuda10.1_build - python_version: "3.8" - use_cuda: "1" - vc_product: BuildTools - vc_version: "" - vc_year: "2019" - vs_version: "16.8.6" - - pytorch_windows_test: - build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10.1" - name: pytorch_windows_vs2019_py38_cuda10.1_on_cpu_test1 - python_version: "3.8" - requires: - - 
pytorch_windows_vs2019_py38_cuda10.1_build - test_name: pytorch-windows-test1 - use_cuda: "0" - vc_product: BuildTools - vc_version: "" - vc_year: "2019" - vs_version: "16.8.6" when: << pipeline.parameters.run_master_build >> slow_gradcheck_build: jobs: From a5a10fe353bef5f9c79c5482ececc6ab1a21447e Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 12 Jul 2021 11:20:12 -0700 Subject: [PATCH 101/122] Move all downloading logic out of common_utils.py (#61479) Summary: and into tools/ folder Currently run_tests.py invokes tools/test_selections.py 1. download and analyze what test_file to run 2. download and parse S3 stats and pass the info to local files. 3. common_utils.py uses download S3 stats to determine what test cases to run. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61479 Reviewed By: janeyx99 Differential Revision: D29661986 Pulled By: walterddr fbshipit-source-id: bebd8c474bcc2444e135bfd2fa4bdd1eefafe595 --- .gitignore | 3 +- test/run_test.py | 8 +- tools/stats/import_test_stats.py | 83 ++++++++++++++++ tools/testing/test_selections.py | 9 ++ torch/testing/_internal/common_utils.py | 120 ++++++------------------ 5 files changed, 130 insertions(+), 93 deletions(-) create mode 100644 tools/stats/import_test_stats.py diff --git a/.gitignore b/.gitignore index b0beb080646e4..c15111dda73ab 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,9 @@ coverage.xml .mypy_cache /.extracted_scripts/ **/.pytorch_specified_test_cases.csv -**/.pytorch-test-times.json +**/.pytorch-disabled-tests.json **/.pytorch-slow-tests.json +**/.pytorch-test-times.json */*.pyc */*.so* */**/__pycache__ diff --git a/test/run_test.py b/test/run_test.py index ae8e1cfb5548f..28c10d51451f7 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -25,7 +25,8 @@ get_shard_based_on_S3, get_slow_tests_based_on_S3, get_specified_test_cases, - get_reordered_tests + get_reordered_tests, + get_test_case_configs, ) HAVE_TEST_SELECTION_TOOLS = True except ImportError: @@ -451,6 +452,9 @@ def run_test(test_module, test_directory, options, launcher_cmd=None, extra_unit # If using pytest, replace -f with equivalent -x if options.pytest: unittest_args = [arg if arg != '-f' else '-x' for arg in unittest_args] + elif IS_IN_CI: + # use the downloaded test cases configuration, not supported in pytest + unittest_args.extend(['--import-slow-tests', '--import-disabled-tests']) # Multiprocessing related tests cannot run with coverage. 
# Tracking issue: https://github.com/pytorch/pytorch/issues/50661 @@ -1044,6 +1048,8 @@ def main(): if IS_IN_CI: selected_tests = get_reordered_tests(selected_tests, ENABLE_PR_HISTORY_REORDERING) + # downloading test cases configuration to local environment + get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__))) has_failed = False failure_messages = [] diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py new file mode 100644 index 0000000000000..8de3a461155ac --- /dev/null +++ b/tools/stats/import_test_stats.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import datetime +import json +import os +import pathlib +import re +from typing import Any, Callable, Dict, Optional, cast +from urllib.request import urlopen + +SLOW_TESTS_FILE = '.pytorch-slow-tests.json' +DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' + +FILE_CACHE_LIFESPAN_SECONDS = datetime.timedelta(hours=3).seconds + +def fetch_and_cache( + dirpath: str, + name: str, + url: str, + process_fn: Callable[[Dict[str, Any]], Dict[str, Any]] +) -> Dict[str, Any]: + """ + This fetch and cache utils allows sharing between different process. + """ + path = os.path.join(dirpath, name) + + def is_cached_file_valid() -> bool: + # Check if the file is new enough (see: FILE_CACHE_LIFESPAN_SECONDS). A real check + # could make a HEAD request and check/store the file's ETag + fname = pathlib.Path(path) + now = datetime.datetime.now() + mtime = datetime.datetime.fromtimestamp(fname.stat().st_mtime) + diff = now - mtime + return diff.total_seconds() < FILE_CACHE_LIFESPAN_SECONDS + + if os.path.exists(path) and is_cached_file_valid(): + # Another test process already downloaded the file, so don't re-do it + with open(path, "r") as f: + return cast(Dict[str, Any], json.load(f)) + try: + contents = urlopen(url, timeout=1).read().decode('utf-8') + processed_contents = process_fn(json.loads(contents)) + with open(path, "w") as f: + f.write(json.dumps(processed_contents)) + return processed_contents + except Exception as e: + print(f'Could not download {url} because of error {e}.') + return {} + + +def get_slow_tests(dirpath: str, filename: str = SLOW_TESTS_FILE) -> Optional[Dict[str, float]]: + url = "https://raw.githubusercontent.com/pytorch/test-infra/master/stats/slow-tests.json" + try: + return fetch_and_cache(dirpath, filename, url, lambda x: x) + except Exception: + print("Couldn't download slow test set, leaving all tests enabled...") + return {} + + +def get_disabled_tests(dirpath: str, filename: str = DISABLED_TESTS_FILE) -> Optional[Dict[str, Any]]: + def process_disabled_test(the_response: Dict[str, Any]) -> Dict[str, Any]: + disabled_test_from_issues = dict() + for item in the_response['items']: + title = item['title'] + key = 'DISABLED ' + if title.startswith(key): + test_name = title[len(key):].strip() + body = item['body'] + platforms_to_skip = [] + key = 'platforms:' + for line in body.splitlines(): + line = line.lower() + if line.startswith(key): + pattern = re.compile(r"^\s+|\s*,\s*|\s+$") + platforms_to_skip.extend([x for x in pattern.split(line[len(key):]) if x]) + disabled_test_from_issues[test_name] = (item['html_url'], platforms_to_skip) + return disabled_test_from_issues + try: + url = 'https://raw.githubusercontent.com/pytorch/test-infra/master/stats/disabled-tests.json' + return fetch_and_cache(dirpath, filename, url, process_disabled_test) + except Exception: + print("Couldn't download test skip set, leaving all tests enabled...") + return {} diff --git 
a/tools/testing/test_selections.py b/tools/testing/test_selections.py index c504f87a85f84..41e32e87bfdfc 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -8,6 +8,10 @@ get_previous_reports_for_pr, Report, Version2Report, HAVE_BOTO3) +from tools.stats.import_test_stats import ( + get_disabled_tests, + get_slow_tests +) from typing import Any, Dict, List, Optional, Tuple, cast from typing_extensions import TypedDict @@ -284,3 +288,8 @@ def export_S3_test_times(test_times_filename: Optional[str] = None) -> Dict[str, json.dump(job_times_json, file, indent=' ', separators=(',', ': ')) file.write('\n') return test_times + + +def get_test_case_configs(dirpath: str) -> None: + get_slow_tests(dirpath=dirpath) + get_disabled_tests(dirpath=dirpath) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 6991e9f893069..6926afd44f47e 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -23,7 +23,6 @@ import random import contextlib import shutil -import datetime import pathlib import socket import subprocess @@ -37,7 +36,6 @@ from numbers import Number import tempfile import json -from urllib.request import urlopen import __main__ # type: ignore[import] import errno from typing import cast, Any, Dict, Iterable, Iterator, Optional, Union @@ -69,6 +67,12 @@ IS_FBCODE = os.getenv('PYTORCH_TEST_FBCODE') == '1' IS_REMOTE_GPU = os.getenv('PYTORCH_TEST_REMOTE_GPU') == '1' +DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' +SLOW_TESTS_FILE = '.pytorch-slow-tests.json' + +slow_tests_dict: Optional[Dict[str, Any]] = None +disabled_tests_dict: Optional[Dict[str, Any]] = None + class ProfilingMode(Enum): LEGACY = 1 SIMPLE = 2 @@ -164,6 +168,8 @@ def _get_test_report_path(): parser.add_argument('--discover-tests', action='store_true') parser.add_argument('--log-suffix', type=str, default="") parser.add_argument('--run-parallel', type=int, default=1) +parser.add_argument('--import-slow-tests', type=str, nargs='?', const=SLOW_TESTS_FILE) +parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DISABLED_TESTS_FILE) args, remaining = parser.parse_known_args() if args.jit_executor == 'legacy': @@ -177,6 +183,8 @@ def _get_test_report_path(): GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode() +IMPORT_SLOW_TESTS = args.import_slow_tests +IMPORT_DISABLED_TESTS = args.import_disabled_tests LOG_SUFFIX = args.log_suffix RUN_PARALLEL = args.run_parallel TEST_BAILOUTS = args.test_bailouts @@ -263,6 +271,16 @@ def sanitize_test_filename(filename): return re.sub('/', r'.', strip_py) def run_tests(argv=UNITTEST_ARGS): + # import test files. 
+ if IMPORT_SLOW_TESTS: + global slow_tests_dict + with open(IMPORT_SLOW_TESTS, 'r') as fp: + slow_tests_dict = json.load(fp) + if IMPORT_DISABLED_TESTS: + global disabled_tests_dict + with open(IMPORT_DISABLED_TESTS, 'r') as fp: + disabled_tests_dict = json.load(fp) + # Determine the test launch mechanism if TEST_DISCOVER: suite = unittest.TestLoader().loadTestsFromModule(__main__) test_cases = discover_test_cases_recursively(suite) @@ -842,93 +860,16 @@ def settings(*args, **kwargs): except ImportError: print('Fail to import hypothesis in common_utils, tests are not derandomized') - -FILE_CACHE_LIFESPAN_SECONDS = datetime.timedelta(hours=3).seconds - -def fetch_and_cache(name: str, url: str): - """ - Some tests run in a different process so globals like `slow_test_dict` won't - always be filled even though the test file was already downloaded on this - machine, so cache it on disk - """ - path = os.path.join(tempfile.gettempdir(), name) - - def is_cached_file_valid(): - # Check if the file is new enough (say 1 hour for now). A real check - # could make a HEAD request and check/store the file's ETag - fname = pathlib.Path(path) - now = datetime.datetime.now() - mtime = datetime.datetime.fromtimestamp(fname.stat().st_mtime) - diff = now - mtime - return diff.total_seconds() < FILE_CACHE_LIFESPAN_SECONDS - - if os.path.exists(path) and is_cached_file_valid(): - # Another test process already downloaded the file, so don't re-do it - with open(path, "r") as f: - return json.load(f) - try: - contents = urlopen(url, timeout=1).read().decode('utf-8') - with open(path, "w") as f: - f.write(contents) - return json.loads(contents) - except Exception as e: - print(f'Could not download {url} because of error {e}.') - return {} - - -slow_tests_dict: Optional[Dict[str, float]] = None -def check_slow_test_from_stats(test): - global slow_tests_dict - if slow_tests_dict is None: - if not IS_SANDCASTLE: - url = "https://raw.githubusercontent.com/pytorch/test-infra/master/stats/slow-tests.json" - slow_tests_dict = fetch_and_cache(".pytorch-slow-tests.json", url) - else: - slow_tests_dict = {} +def check_if_enable(test: unittest.TestCase): test_suite = str(test.__class__).split('\'')[1] test_name = f'{test._testMethodName} ({test_suite})' - - if test_name in slow_tests_dict: + if slow_tests_dict is not None and test_name in slow_tests_dict: getattr(test, test._testMethodName).__dict__['slow_test'] = True if not TEST_WITH_SLOW: raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test") - - -disabled_test_from_issues: Optional[Dict[str, Any]] = None -def check_disabled(test_name): - global disabled_test_from_issues - if disabled_test_from_issues is None: - _disabled_test_from_issues: Dict = {} - - def read_and_process(): - url = 'https://raw.githubusercontent.com/pytorch/test-infra/master/stats/disabled-tests.json' - the_response = fetch_and_cache(".pytorch-disabled-tests", url) - for item in the_response['items']: - title = item['title'] - key = 'DISABLED ' - if title.startswith(key): - test_name = title[len(key):].strip() - body = item['body'] - platforms_to_skip = [] - key = 'platforms:' - for line in body.splitlines(): - line = line.lower() - if line.startswith(key): - pattern = re.compile(r"^\s+|\s*,\s*|\s+$") - platforms_to_skip.extend([x for x in pattern.split(line[len(key):]) if x]) - _disabled_test_from_issues[test_name] = (item['html_url'], platforms_to_skip) - - if not IS_SANDCASTLE and os.getenv("PYTORCH_RUN_DISABLED_TESTS", "0") != "1": - try: - read_and_process() - 
disabled_test_from_issues = _disabled_test_from_issues - except Exception: - print("Couldn't download test skip set, leaving all tests enabled...") - disabled_test_from_issues = {} - - if disabled_test_from_issues is not None: - if test_name in disabled_test_from_issues: - issue_url, platforms = disabled_test_from_issues[test_name] + if not IS_SANDCASTLE and disabled_tests_dict is not None: + if test_name in disabled_tests_dict: + issue_url, platforms = disabled_tests_dict[test_name] platform_to_conditional: Dict = { "mac": IS_MACOS, "macos": IS_MACOS, @@ -940,7 +881,9 @@ def read_and_process(): f"Test is disabled because an issue exists disabling it: {issue_url}" + f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}." + " To enable, set the environment variable PYTORCH_RUN_DISABLED_TESTS=1") - + if TEST_SKIP_FAST: + if not getattr(test, test._testMethodName).__dict__.get('slow_test', False): + raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST") # Acquires the comparison dtype, required since isclose # requires both inputs have the same dtype, and isclose is not supported @@ -1106,12 +1049,7 @@ def run(self, result=None): result.stop() def setUp(self): - check_slow_test_from_stats(self) - if TEST_SKIP_FAST: - if not getattr(self, self._testMethodName).__dict__.get('slow_test', False): - raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST") - check_disabled(str(self)) - + check_if_enable(self) set_rng_seed(SEED) @staticmethod From 5144cc029e6e1c2e229356a31f2f44f59b33781e Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 12 Jul 2021 11:29:22 -0700 Subject: [PATCH 102/122] Bump docker image tag for clang-tidy (#61545) Summary: Fixes recent `clang-diagnostic-errors` on clang-tidy runs See https://github.com/pytorch/test-infra/pull/59 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61545 Reviewed By: malfet, seemethere Differential Revision: D29664061 Pulled By: 1ntEgr8 fbshipit-source-id: cca482a8774e34e61919f2298846ae0b479bf224 --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4d12baa5610f7..27efda7bd04b7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -312,7 +312,7 @@ jobs: runs-on: linux.2xlarge container: # ubuntu20.04-cuda11.2-py3.8-tidy11 - image: ghcr.io/pytorch/cilint-clang-tidy:6e81eee1f23596060ac64cfec9b1f4953eb12931 + image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a steps: - name: Checkout PyTorch uses: actions/checkout@v2 From 255a32425862890cabed06bba6079346e1838339 Mon Sep 17 00:00:00 2001 From: Jeff Hwang Date: Mon, 12 Jul 2021 11:39:50 -0700 Subject: [PATCH 103/122] add nesting_level as attribute to pickle for map datapipe (#61534) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61534 currently, attribute `nesting_level` on `MapIterDataPipe` is not pickled. 
this yields `AttributeError` exceptions when multiprocessing with `DataLoader` this diff adds it as an attribute to pickle Test Plan: confirmed errors go away after change Reviewed By: ejguan Differential Revision: D29648655 fbshipit-source-id: 943b57eaff9712eb7ce92f43cb360acdb3111f2b --- torch/utils/data/datapipes/iter/callable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index 23f7256afbd74..718d07a3a0416 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -93,11 +93,11 @@ def __getstate__(self): dill_function = dill.dumps(self.fn) else: dill_function = self.fn - state = (self.datapipe, dill_function, self.args, self.kwargs) + state = (self.datapipe, dill_function, self.args, self.kwargs, self.nesting_level) return state def __setstate__(self, state): - (self.datapipe, dill_function, self.args, self.kwargs) = state + (self.datapipe, dill_function, self.args, self.kwargs, self.nesting_level) = state if DILL_AVAILABLE: self.fn = dill.loads(dill_function) # type: ignore[assignment] else: From 68f9819df499b3bbbe7f1095516c3fad71f5b04b Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 12 Jul 2021 12:42:34 -0700 Subject: [PATCH 104/122] Typo fix (#41121) Summary: Description: - Typo fix in the docstring Pull Request resolved: https://github.com/pytorch/pytorch/pull/41121 Reviewed By: heitorschueroff Differential Revision: D29660228 Pulled By: ezyang fbshipit-source-id: fc2b55683ec5263ff55c3b6652df3e6313e02be2 --- torch/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 1c2aa32aa1dc4..9ce5376b60ac9 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -4036,7 +4036,7 @@ def affine_grid(theta: Tensor, size: List[int], align_corners: Optional[bool] = Up to version 1.2.0, all grid points along a unit dimension were considered arbitrarily to be at ``-1``. From version 1.3.0, under ``align_corners = True`` all grid points - along a unit dimension are considered to be at ```0`` + along a unit dimension are considered to be at ``0`` (the center of the input image). """ if has_torch_function_unary(theta): From 65ab861ec60ebc967e7b094386639a0e99a237d4 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 12 Jul 2021 12:49:37 -0700 Subject: [PATCH 105/122] fix mm not correctly report TORCH_CHECK failure issue (#61394) Summary: fixes https://github.com/pytorch/pytorch/issues/61291. 
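(Illustrative snippet, not part of the original summary — the shapes below are arbitrary.) With the size check moved into the meta function shared by `mm` and `addmm`, a mismatch is now reported up front as a descriptive `RuntimeError`:

```
import torch

a = torch.randn(3, 5)
b = torch.randn(4, 2)  # inner dimensions (5 vs. 4) do not match

# Expected to fail with:
# RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x5 and 4x2)
torch.mm(a, b)
```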
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61394 Reviewed By: zhouzhuojie, seemethere Differential Revision: D29614208 Pulled By: walterddr fbshipit-source-id: f49a15dde708e30b06059b47fae1cda7c2c3571c --- aten/src/ATen/native/LinearAlgebra.cpp | 12 ++++++------ test/test_linalg.py | 5 +++++ .../_internal/distributed/distributed_test.py | 7 ++++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 4a67bf3240ea2..5b8aff851db35 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -29,6 +29,9 @@ namespace meta { TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); set_output(0, {mat1.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); @@ -42,6 +45,9 @@ TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat TORCH_META_FUNC(mm)(const Tensor & self, const Tensor & mat2) { TORCH_CHECK(self.dim() == 2, "self must be a matrix"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); auto names = at::namedinference::compute_matmul_outnames(self, mat2); set_output(0, {self.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); @@ -945,12 +951,6 @@ static void addmm_impl_cpu_( auto m2_strides = m2.strides(); auto m2_sizes = m2.sizes(); - // keeping TORCH_CHECKs here because othe mm methods also utilize this impl. - // TODO move this to meta once all methods have migrated to structured kernel. 
- TORCH_CHECK( - m1_sizes[1] == m2_sizes[0], "mat1 and mat2 shapes cannot be multiplied (", - m1_sizes[0], "x", m1_sizes[1], " and ", m2_sizes[0], "x", m2_sizes[1], ")"); - TORCH_CHECK( self_sizes[0] == m1_sizes[0] && self_sizes[1] == m2_sizes[1], "input shape is incompatible with matrix multiplication (", diff --git a/test/test_linalg.py b/test/test_linalg.py index fe15f8b6a7eb7..d54c4cea5c7ce 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -6031,6 +6031,11 @@ def test_addmm_sizes(self, device, dtype): m2 = torch.randn(k, m, device=device).to(dtype) self._test_addmm_addmv(torch.addmm, M, m1, m2) + m1 = torch.randn(n, k + 1, device=device).to(dtype) + m2 = torch.randn(k, m, device=device).to(dtype) + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) + @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") @onlyCUDA def test_matmul_45724(self, device): diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 6b3a50ba354f7..2d20c50b0f51f 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -7080,6 +7080,7 @@ def test_ddp_multiple_nested_unused_params_err_ignore_params(self): # certain parameters. self._test_ddp_multiple_nested_unused_params_error(ignore_sparse=True) + @unittest.skip("See: https://github.com/pytorch/pytorch/issues/61481") @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', "Only Nccl & Gloo backend support DistributedDataParallel") @skip_if_lt_x_gpu(2) @@ -7097,9 +7098,9 @@ def test_ddp_inference(self): syncbn_model = nn.SyncBatchNorm( 2, momentum=0.99, track_running_stats=False ).cuda() - local_syncbn_model = copy.deepcopy(model) + local_syncbn_model = copy.deepcopy(syncbn_model) syncbn_model = torch.nn.parallel.DistributedDataParallel( - model, + syncbn_model, device_ids=[rank] ) inp = torch.randn(10, 2, device=rank) @@ -7118,7 +7119,7 @@ def test_ddp_inference(self): test_local_model(test_inp) ) - model.eval() + test_model.eval() for _ in range(6): self.assertEqual( test_model(test_inp), From 5897a60480cdc7a93eaadd824f7452aa8bf7696a Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Mon, 12 Jul 2021 12:51:12 -0700 Subject: [PATCH 106/122] warn about SVD outputs not supporting backprop (#61037) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61037 * **#61037** Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D29491985 Pulled By: dagitses fbshipit-source-id: 6322e7c86cade52671062ee97d2fcb8c15d8aa86 --- torch/_torch_docs.py | 4 ++-- torch/linalg/__init__.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index aeb2e778466cb..b0eaf715a83ee 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -9011,9 +9011,9 @@ def merge_dicts(*dicts): default value for both is `True`, so the default behavior is effectively the opposite. * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns - `Vh`, that is, `Vᴴ`. + `Vᴴ`. * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled - tensors for `U` and `Vh`, whereas :func:`torch.linalg.svd` returns + tensors for `U` and `Vᴴ`, whereas :func:`torch.linalg.svd` returns empty tensors. .. note:: The singular values are returned in descending order. 
If :attr:`input` is a batch of matrices, diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index b998a062e9951..f897694ed1255 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -1452,7 +1452,7 @@ Also supports batches of matrices, and if :attr:`A` is a batch of matrices then the output has the same batch dimensions. -The returned decomposition is a named tuple `(U, S, Vh)` +The returned decomposition is a named tuple `(U, S, Vᴴ)` which corresponds to :math:`U`, :math:`S`, :math:`V^{\text{H}}` above. The singular values are returned in descending order. @@ -1483,7 +1483,7 @@ In this case, one may multiply the associated singular vectors of `U` and `V` spanning the subspace by a rotation matrix and `the resulting vectors will span the same subspace`_. -.. warning:: Gradients computed using `U` or `Vh` will only be finite when +.. warning:: Gradients computed using `U` or `Vᴴ` will only be finite when :attr:`A` does not have zero as a singular value or repeated singular values. Furthermore, if the distance between any two singular values is close to zero, the gradient will be numerically unstable, as it depends on the singular values @@ -1512,19 +1512,19 @@ full_matrices (bool, optional): controls whether to compute the full or reduced SVD, and consequently, the shape of the returned tensors - `U` and `Vh`. Default: `True`. + `U` and `Vᴴ`. Default: `True`. Keyword args: out (tuple, optional): output tuple of three tensors. Ignored if `None`. Returns: - A named tuple `(U, S, Vh)` which corresponds to :math:`U`, :math:`S`, :math:`V^{\text{H}}` above. + A named tuple `(U, S, Vᴴ)` which corresponds to :math:`U`, :math:`S`, :math:`V^{\text{H}}` above. `S` will always be real-valued, even when :attr:`A` is complex. It will also be ordered in descending order. - `U` and `Vh` will have the same dtype as :attr:`A`. The left / right singular vectors will be given by - the columns of `U` and the rows of `Vh` respectively. + `U` and `Vᴴ` will have the same dtype as :attr:`A`. The left / right singular vectors will be given by + the columns of `U` and the rows of `Vᴴ` respectively. Examples:: From 58df01c3b808876949ecc50f27865357547b58dd Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Mon, 12 Jul 2021 12:51:12 -0700 Subject: [PATCH 107/122] clarify default value of requires_grad for tensors (#61038) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61038 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D29491984 Pulled By: dagitses fbshipit-source-id: 7e6b7f8e81d77f38c881b86a68c17d3cf5483dad --- docs/source/notes/autograd.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst index 6d0e0e83d3d2b..0c1eed3f42457 100644 --- a/docs/source/notes/autograd.rst +++ b/docs/source/notes/autograd.rst @@ -56,9 +56,10 @@ to disable gradient computation but, because of its name, is often mixed up with Setting ``requires_grad`` ^^^^^^^^^^^^^^^^^^^^^^^^^ -:attr:`requires_grad` is a flag that allows for fine-grained exclusion of -subgraphs from gradient computation. It takes effect in both the forward -and backward passes: +:attr:`requires_grad` is a flag, defaulting to false *unless wrapped +in a ``nn.Parameter``*, that allows for fine-grained exclusion of +subgraphs from gradient computation. 
It takes effect in both the +forward and backward passes: During the forward pass, an operation is only recorded in the backward graph if at least one of its input tensors require grad. From 7fdce39a4b735c86ee99330f6bb1decbe3492d76 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 12 Jul 2021 13:57:42 -0700 Subject: [PATCH 108/122] Revert D29642891: .circleci: Remove force_on_cpu jobs from circleci Test Plan: revert-hammer Differential Revision: D29642891 (https://github.com/pytorch/pytorch/commit/2aedd17661c600758228b694be061d4ca7906357) Original commit changeset: d51bb859bc28 fbshipit-source-id: a39a2d57d6e68961d94d4137a57bdc280f9b1b5b --- .../cimodel/data/windows_build_definitions.py | 16 +++-- .circleci/config.yml | 58 +++++++++++++++++++ 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/.circleci/cimodel/data/windows_build_definitions.py b/.circleci/cimodel/data/windows_build_definitions.py index f45eaaa11b4d8..b02104ecc2a08 100644 --- a/.circleci/cimodel/data/windows_build_definitions.py +++ b/.circleci/cimodel/data/windows_build_definitions.py @@ -1,7 +1,6 @@ -# TODO: Delete this file after we get re-run with SSH on windows for GHA - import cimodel.lib.miniutils as miniutils from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN, NON_PR_BRANCH_LIST +from cimodel.data.simple.util.versions import CudaVersion class WindowsJob: @@ -144,19 +143,18 @@ def prefixed_year(self): def render(self): return "_".join(self.get_elements()) +_VC2019 = VcSpec(2019) WORKFLOW_DATA = [ + # VS2019 CUDA-10.1 + WindowsJob(None, _VC2019, CudaVersion(10, 1), master_only=True), + # VS2019 CUDA-10.1 force on cpu + WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True), + # TODO: This test is disabled due to https://github.com/pytorch/pytorch/issues/59724 # WindowsJob('_azure_multi_gpu', _VC2019, CudaVersion(11, 1), multi_gpu=True, master_and_nightly=True), ] -# NOTE: For users looking to re-run windows builds with SSH uncomment the following lines -# from cimodel.data.simple.util.versions import CudaVersion -# _VC2019 = VcSpec(2019) -# WORKFLOW_DATA.extend([ -# WindowsJob(None, _VC2019, CudaVersion(10, 1)), -# WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True), -# ]) def get_windows_workflows(): return [item.gen_tree() for item in WORKFLOW_DATA] diff --git a/.circleci/config.yml b/.circleci/config.yml index a6421d5507220..e24fbebc3d558 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7616,6 +7616,41 @@ workflows: branches: only: - postnightly + - pytorch_windows_build: + build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 + cuda_version: "10.1" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch_windows_vs2019_py38_cuda10.1_build + python_version: "3.8" + use_cuda: "1" + vc_product: BuildTools + vc_version: "" + vc_year: "2019" + vs_version: "16.8.6" + - pytorch_windows_test: + build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 + cuda_version: "10.1" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch_windows_vs2019_py38_cuda10.1_on_cpu_test1 + python_version: "3.8" + requires: + - pytorch_windows_vs2019_py38_cuda10.1_build + test_name: pytorch-windows-test1 + use_cuda: "0" + vc_product: BuildTools + vc_version: "" + vc_year: "2019" + vs_version: "16.8.6" - update_s3_htmls: context: org-member filters: @@ -9147,6 +9182,29 @@ workflows: name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test requires: - 
binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build + - pytorch_windows_build: + build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 + cuda_version: "10.1" + name: pytorch_windows_vs2019_py38_cuda10.1_build + python_version: "3.8" + use_cuda: "1" + vc_product: BuildTools + vc_version: "" + vc_year: "2019" + vs_version: "16.8.6" + - pytorch_windows_test: + build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 + cuda_version: "10.1" + name: pytorch_windows_vs2019_py38_cuda10.1_on_cpu_test1 + python_version: "3.8" + requires: + - pytorch_windows_vs2019_py38_cuda10.1_build + test_name: pytorch-windows-test1 + use_cuda: "0" + vc_product: BuildTools + vc_version: "" + vc_year: "2019" + vs_version: "16.8.6" when: << pipeline.parameters.run_master_build >> slow_gradcheck_build: jobs: From 2fd37a830ede2c01e498ed103f62fd3421c01b5d Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 12 Jul 2021 13:57:42 -0700 Subject: [PATCH 109/122] Revert D29642893: .github: Add force_on_cpu tests for windows Test Plan: revert-hammer Differential Revision: D29642893 (https://github.com/pytorch/pytorch/commit/a52de0dfec4eeb288882cdea683a212f03cf3fc7) Original commit changeset: 2dd2b295c71d fbshipit-source-id: c01c421689f6d01cdfb3fe60a8c6428253249c5f --- .github/scripts/generate_ci_workflows.py | 3 --- .github/scripts/generate_pytorch_test_matrix.py | 11 ++++------- .github/templates/windows_ci_workflow.yml.j2 | 2 -- .../periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 2 -- .github/workflows/pytorch-win-vs2019-cpu-py3.yml | 2 -- .../pytorch-win-vs2019-cuda10-cudnn7-py3.yml | 2 -- .../pytorch-win-vs2019-cuda11-cudnn8-py3.yml | 2 -- 7 files changed, 4 insertions(+), 20 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index a6a6f1b85635d..056dabeaaa4de 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -28,7 +28,6 @@ def PyTorchWindowsWorkflow( test_runner_type: str, cuda_version: str, on_pull_request: bool = False, - enable_force_on_cpu_test: YamlShellBool = "''", only_build_on_pull_request: bool = False, num_test_shards: int = 1, is_scheduled: Optional[str] = None, @@ -38,7 +37,6 @@ def PyTorchWindowsWorkflow( "test_runner_type": test_runner_type, "cuda_version": cuda_version, "on_pull_request": on_pull_request, - "enable_force_on_cpu_test": enable_force_on_cpu_test, "only_build_on_pull_request": only_build_on_pull_request and on_pull_request, "is_scheduled": is_scheduled, "num_test_shards": num_test_shards, @@ -108,7 +106,6 @@ def generate_workflow_file( test_runner_type=WINDOWS_CUDA_TEST_RUNNER, on_pull_request=True, num_test_shards=2, - enable_force_on_cpu_test=1, ), PyTorchWindowsWorkflow( build_environment="pytorch-win-vs2019-cuda11-cudnn8-py3", diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 193d1c7412158..70707c4e99f6f 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -30,13 +30,10 @@ def main() -> None: configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None: - if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - 
configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_FORCE_ON_CPU_TEST'): - configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): + configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): + configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} if os.getenv('ENABLE_SLOW_TEST'): configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index df2bff1c9474e..47234ccb986c7 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -108,8 +108,6 @@ jobs: env: TEST_RUNNER_TYPE: !{{ test_runner_type }} NUM_TEST_SHARDS: !{{ num_test_shards }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 21aa5ddbebf0d..3af28a3bce87e 100644 --- a/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/periodic-pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -91,8 +91,6 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml index 792ac2c4294ba..753c053304f61 100644 --- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml @@ -84,8 +84,6 @@ jobs: env: TEST_RUNNER_TYPE: windows.4xlarge NUM_TEST_SHARDS: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml index d3f8f15f19a7e..ef6e69caff82a 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda10-cudnn7-py3.yml @@ -94,8 +94,6 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} diff --git a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml index 2ed58c4a6b99b..7580ba180d886 100644 --- a/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml +++ b/.github/workflows/pytorch-win-vs2019-cuda11-cudnn8-py3.yml @@ -93,8 +93,6 @@ jobs: env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} From 4d842d909bb17cd6642670a38864e3d66dfe6de1 Mon 
Sep 17 00:00:00 2001 From: Joel Schlosser Date: Mon, 12 Jul 2021 14:17:49 -0700 Subject: [PATCH 110/122] Revert FC workaround for ReflectionPad3d (#61308) Summary: Fixes https://github.com/pytorch/pytorch/issues/61248 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61308 Reviewed By: iramazanli Differential Revision: D29566849 Pulled By: jbschlosser fbshipit-source-id: 8ab443ffef7fd9840d64d71afc2f2d2b8a410ddb --- test/test_nn.py | 1 - torch/_C/_nn.pyi.in | 2 -- torch/nn/functional.py | 2 +- torch/nn/modules/padding.py | 10 ---------- 4 files changed, 1 insertion(+), 14 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 398ef70c5613c..d1cefbd2065b1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -13144,7 +13144,6 @@ def test_ReflectionPad2d_large(self, device): self.assertEqual(x.grad, ref_x.grad) @onlyCUDA # Test if CPU and GPU results match - @unittest.skipIf(True, "temporarily disabled") def test_ReflectionPad3d_large(self, device): shapes = ([2, 1000, 7, 7, 7], [1000, 2, 7, 7, 7]) pad = (1, 2, 3, 4, 5, 6) diff --git a/torch/_C/_nn.pyi.in b/torch/_C/_nn.pyi.in index 75c7b20d15ff9..b2b2bcbbefdd8 100644 --- a/torch/_C/_nn.pyi.in +++ b/torch/_C/_nn.pyi.in @@ -31,5 +31,3 @@ def pad_sequence(sequences: List[Tensor], batch_first: bool = False, def flatten_dense_tensors(tensors: List[Tensor]) -> Tensor: ... def unflatten_dense_tensors(flat: Tensor, tensors: List[Tensor]) -> List[Tensor]: ... - -def reflection_pad3d(input: Tensor, pad: Tuple[int, int, int, int, int, int]) -> Tensor: ... diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 9ce5376b60ac9..df75e55a45c16 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -4179,7 +4179,7 @@ def _pad(input: Tensor, pad: List[int], mode: str = "constant", value: float = 0 elif input.dim() == 5: assert len(pad) == 6, "5D tensors expect 6 values for padding" if mode == "reflect": - raise NotImplementedError + return torch._C._nn.reflection_pad3d(input, pad) elif mode == "replicate": return torch._C._nn.replication_pad3d(input, pad) elif mode == "circular": diff --git a/torch/nn/modules/padding.py b/torch/nn/modules/padding.py index b2e13328944b0..3156bbbc72915 100644 --- a/torch/nn/modules/padding.py +++ b/torch/nn/modules/padding.py @@ -6,8 +6,6 @@ from ..common_types import _size_2_t, _size_4_t, _size_6_t from typing import Sequence, Tuple -import torch - # TODO: grad_output size asserts in THNN @@ -319,14 +317,6 @@ def __init__(self, padding: _size_6_t) -> None: super(ReflectionPad3d, self).__init__() self.padding = _ntuple(6)(padding) - # TODO: Remove this forward() implementation and fallback to base implementation - # once the FC window for the new op has passed. This hack is temporarily provided - # to avoid breaking JIT-serialized models that rely on _pad() but not reflection_pad3d. 
- def forward(self, input: Tensor) -> Tensor: - assert len(self.padding) % 2 == 0, "Padding length must be divisible by 2" - assert len(self.padding) // 2 <= input.dim(), "Padding length too large" - return torch._C._nn.reflection_pad3d(input, self.padding) - class _ReplicationPadNd(Module): __constants__ = ['padding'] From d3cb065b2fb59cdd715f7e2c55d9246795cd8152 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 12 Jul 2021 14:19:44 -0700 Subject: [PATCH 111/122] Implement usage of `is_shardable` and `apply_sharding` (#61236) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61236 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29588835 Pulled By: VitalyFedyunin fbshipit-source-id: 00c3042f96af498637b2dcf6e3f842c1fc05ddd8 --- test/test_datapipe.py | 28 +++++++++++++++++++++ torch/utils/data/datapipes/iter/grouping.py | 20 +++++++++++++++ torch/utils/data/sharding.py | 26 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 torch/utils/data/sharding.py diff --git a/test/test_datapipe.py b/test/test_datapipe.py index fbf4eb9918ed6..302a62558caa9 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -25,6 +25,7 @@ import torch.nn as nn import torch.utils.data.datapipes as dp import torch.utils.data.graph +import torch.utils.data.sharding from torch.testing._internal.common_utils import (TestCase, run_tests) from torch.utils.data import ( @@ -1314,5 +1315,32 @@ def test_traverse_forked(self): expected = {combined_dp: {dp0_upd: {dp0: {}}, dp1_upd: {dp1: {}}, dp2: {}}} self.assertEqual(expected, graph) + +class TestSharding(TestCase): + @skipIfNoDill + def test_simple_sharding(self): + def get_pipeline(): + numbers_dp = NumbersDataset(size=10) + dp0, dp1 = numbers_dp.fork(2) + dp0_upd = dp0.map(lambda x: x * 10) + dp1_upd = dp1.filter(lambda x: x % 3 == 1) + combined_dp = dp0_upd.mux(dp1_upd) + return combined_dp + + sharded_dp = get_pipeline().sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp, 3, 1) + items = list(sharded_dp) + self.assertEqual([1, 20, 40, 70], items) + + all_items = list(get_pipeline()) + items = [] + for i in range(3): + sharded_dp = get_pipeline().sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp, 3, i) + items += list(sharded_dp) + + self.assertEqual(sorted(all_items), sorted(items)) + + if __name__ == '__main__': run_tests() diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index 781b59f7aa97e..1013fcaf8aaf3 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -10,6 +10,26 @@ T_co = TypeVar('T_co', covariant=True) +@functional_datapipe('sharding_filter') +class ShardingFilterIterDataPipe(IterDataPipe): + def __init__(self, source_datapipe): + self.source_datapipe = source_datapipe + self.num_of_instances = 1 + self.instance_id = 0 + + def is_shardable(self): + return True + + def apply_sharding(self, num_of_instances, instance_id): + self.num_of_instances = num_of_instances + self.instance_id = instance_id + + def __iter__(self): + for i, item in enumerate(self.source_datapipe): + if i % self.num_of_instances == self.instance_id: + yield item + + @functional_datapipe('batch') class BatchIterDataPipe(IterDataPipe[List[T_co]]): r""" :class:`BatchIterDataPipe`. 
diff --git a/torch/utils/data/sharding.py b/torch/utils/data/sharding.py new file mode 100644 index 0000000000000..b66944dc724c3 --- /dev/null +++ b/torch/utils/data/sharding.py @@ -0,0 +1,26 @@ +import torch.utils.data.graph + + +def apply_sharding(datapipe, num_of_instances, instance_id): + graph = torch.utils.data.graph.traverse(datapipe) + + def traverse_graph(graph): + results = set() + for datapipe, sub_graph in graph.items(): + results.add(datapipe) + sub_items = traverse_graph(sub_graph) + for item in sub_items: + results.add(item) + return results + + all_pipes = traverse_graph(graph) + already_applied_to = None + for pipe in all_pipes: + if hasattr(pipe, 'is_shardable'): + if pipe.is_shardable(): + if hasattr(pipe, 'apply_sharding'): + if already_applied_to is not None: + raise RuntimeError('This implementation of sharding can be only applied once per instance of DataPipeline.', + 'Already applied to', already_applied_to, 'while trying to apply to', pipe) + pipe.apply_sharding(num_of_instances, instance_id) + already_applied_to = pipe From fd13e925ecdd20294c5f28dd5cfa367fc3048c53 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 12 Jul 2021 14:52:32 -0700 Subject: [PATCH 112/122] Adding backward compatibility for sharding support in old DataLoader (#61237) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61237 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29588832 Pulled By: VitalyFedyunin fbshipit-source-id: 3bfa4417f6a04450f656ecf28fc95322d2cf076a --- test/test_datapipe.py | 36 +++++++++++++++------- torch/utils/data/backward_compatibility.py | 8 +++++ 2 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 torch/utils/data/backward_compatibility.py diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 302a62558caa9..145513b5de595 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -26,6 +26,7 @@ import torch.utils.data.datapipes as dp import torch.utils.data.graph import torch.utils.data.sharding +import torch.utils.data.backward_compatibility from torch.testing._internal.common_utils import (TestCase, run_tests) from torch.utils.data import ( @@ -1317,30 +1318,43 @@ def test_traverse_forked(self): class TestSharding(TestCase): + def _get_pipeline(self): + numbers_dp = NumbersDataset(size=10) + dp0, dp1 = numbers_dp.fork(2) + dp0_upd = dp0.map(lambda x: x * 10) + dp1_upd = dp1.filter(lambda x: x % 3 == 1) + combined_dp = dp0_upd.mux(dp1_upd) + return combined_dp + @skipIfNoDill def test_simple_sharding(self): - def get_pipeline(): - numbers_dp = NumbersDataset(size=10) - dp0, dp1 = numbers_dp.fork(2) - dp0_upd = dp0.map(lambda x: x * 10) - dp1_upd = dp1.filter(lambda x: x % 3 == 1) - combined_dp = dp0_upd.mux(dp1_upd) - return combined_dp - - sharded_dp = get_pipeline().sharding_filter() + sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp, 3, 1) items = list(sharded_dp) self.assertEqual([1, 20, 40, 70], items) - all_items = list(get_pipeline()) + all_items = list(self._get_pipeline()) items = [] for i in range(3): - sharded_dp = get_pipeline().sharding_filter() + sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.sharding.apply_sharding(sharded_dp, 3, i) items += list(sharded_dp) self.assertEqual(sorted(all_items), sorted(items)) + @skipIfNoDill + def test_old_dataloader(self): + dp = self._get_pipeline() + expected = list(dp) + + dp = self._get_pipeline().sharding_filter() + dl = DataLoader(dp, batch_size=1, 
shuffle=False, num_workers=2, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + items = [] + for i in dl: + items.append(i) + + self.assertEqual(sorted(expected), sorted(items)) if __name__ == '__main__': run_tests() diff --git a/torch/utils/data/backward_compatibility.py b/torch/utils/data/backward_compatibility.py new file mode 100644 index 0000000000000..cc4e4b6bfd13b --- /dev/null +++ b/torch/utils/data/backward_compatibility.py @@ -0,0 +1,8 @@ +import torch.utils.data.sharding + + +def worker_init_fn(worker_id): + info = torch.utils.data.get_worker_info() + num_workers = info.num_workers + datapipe = info.dataset + torch.utils.data.sharding.apply_sharding(datapipe, num_workers, worker_id) From 4ef640d6f6dad4999bd65a420616f71811a50a13 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 12 Jul 2021 15:31:19 -0700 Subject: [PATCH 113/122] Sort imports of test_datapipe.py (#61312) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61312 Sorting according to isort output. Alphabetically ordered one per line imports help merging. Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29588833 Pulled By: VitalyFedyunin fbshipit-source-id: 4c80c3086132b50894e734ad6c5799d78d689e42 --- test/test_datapipe.py | 57 +++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 145513b5de595..f335e1a8c3920 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -1,39 +1,57 @@ +import http.server import itertools -import numpy as np import os import os.path import pickle import random +import socketserver import sys import tarfile import tempfile -import warnings -import zipfile - -import unittest -from unittest import skipIf -from typing import ( - Any, Awaitable, Dict, Generic, Iterator, List, NamedTuple, Optional, Tuple, - Type, TypeVar, Set, Union) -import http.server -import socketserver import threading import time +import unittest +import warnings +import zipfile from functools import partial +from typing import ( + Any, + Awaitable, + Dict, + Generic, + Iterator, + List, + NamedTuple, + Optional, + Set, + Tuple, + Type, + TypeVar, + Union, +) +from unittest import skipIf + +import numpy as np import torch import torch.nn as nn +import torch.utils.data.backward_compatibility import torch.utils.data.datapipes as dp import torch.utils.data.graph import torch.utils.data.sharding -import torch.utils.data.backward_compatibility - -from torch.testing._internal.common_utils import (TestCase, run_tests) +from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils.data import ( - IterDataPipe, MapDataPipe, RandomSampler, DataLoader, - argument_validation, runtime_validation_disabled, runtime_validation) + DataLoader, + IterDataPipe, + MapDataPipe, + RandomSampler, + argument_validation, + runtime_validation, + runtime_validation_disabled, +) from torch.utils.data.datapipes.utils.decoder import ( - basichandlers as decoder_basichandlers) + basichandlers as decoder_basichandlers, +) try: import torchvision.transforms @@ -126,7 +144,10 @@ def test_listdirfiles_iterable_datapipe(self): def test_loadfilesfromdisk_iterable_datapipe(self): # test import datapipe class directly - from torch.utils.data.datapipes.iter import ListDirFiles, LoadFilesFromDisk + from torch.utils.data.datapipes.iter import ( + ListDirFiles, + LoadFilesFromDisk, + ) temp_dir = self.temp_dir.name datapipe1 = ListDirFiles(temp_dir, '') 
From 8a2c7d902fb159bb57a0cf46f3a5ba74095cc34a Mon Sep 17 00:00:00 2001 From: Don Jang Date: Mon, 12 Jul 2021 18:02:33 -0700 Subject: [PATCH 114/122] [static runtime] Add DCHECK to ensure that outputs do not overlap with immutable inputs (#61301) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61301 This change adds a `DCHECK` to ensure that outputs do not overlap with immutable inputs. Test Plan: Added unittests as follows: - `ProcessedNode.VerifyOutputsNotOverlappingWithImmutableInputsWithImmutableArguments` - `ProcessedNode.VerifyOutputsNotOverlappingWithImmutableInputsWithMutableArguments` Reviewed By: hlu1 Differential Revision: D29564158 fbshipit-source-id: bf14b4978ab544af79010cf724ed28202b4521cc --- .../static_runtime/test_static_runtime.cc | 46 +++++++++++++++++++ torch/csrc/jit/runtime/static/impl.cpp | 27 +++++++++++ torch/csrc/jit/runtime/static/impl.h | 2 + 3 files changed, 75 insertions(+) diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 01935427ec3dd..b4e27e9b909c5 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -168,6 +168,16 @@ bool testHasInplaceOp(const std::string& jit_script) { torch::jit::AliasDb alias_db(graph); return torch::jit::HasInplaceOp(graph, alias_db); } + +static Node* getNodeWithKind(const torch::jit::StaticModule& smodule, const string& kind) { + for (auto& pnode : smodule.nodes()) { + if (std::string(pnode.node()->kind().toQualString()) == kind) { + return pnode.node(); + } + } + return nullptr; +} + } // namespace TEST(StaticRuntime, InPlace) { @@ -857,3 +867,39 @@ TEST(StaticRuntime, FusionPass) { } } } + +TEST(ProcessedNode, VerifyOutputsNotOverlappingWithImmutableInputsWithImmutableArguments) { + script::Module module("module"); + // Not using out= variant. + module.define(sigmoid_script); + torch::jit::StaticModule smodule(module); + Node* sigmoid_node = getNodeWithKind(smodule, "aten::sigmoid"); + const at::IValue a = torch::randn({2, 3}); + at::IValue b = torch::randn({3, 1}); + std::vector ivalue_inputs{&a}; + ProcessedNode pnode(sigmoid_node, std::move(ivalue_inputs), true); + + pnode.Output(0) = b; + EXPECT_TRUE(pnode.verify_outputs_not_overlapping_with_immutable_inputs()); + + pnode.Output(0) = a; + EXPECT_FALSE(pnode.verify_outputs_not_overlapping_with_immutable_inputs()); +} + +TEST(ProcessedNode, VerifyOutputsNotOverlappingWithImmutableInputsWithMutableArguments) { + script::Module module("module"); + // Using out= variant. 
+ module.define(sigmoid_inplace_script); + torch::jit::StaticModule smodule(module); + Node* sigmoid_node = getNodeWithKind(smodule, "aten::sigmoid"); + const at::IValue a = torch::randn({2, 3}); + at::IValue b = torch::randn({3, 1}); + std::vector ivalue_inputs{&a}; + ProcessedNode pnode(sigmoid_node, std::move(ivalue_inputs), true); + + pnode.Output(0) = b; + EXPECT_TRUE(pnode.verify_outputs_not_overlapping_with_immutable_inputs()); + + pnode.Output(0) = a; + EXPECT_TRUE(pnode.verify_outputs_not_overlapping_with_immutable_inputs()); +} diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 4f796ffa56930..3b1468c50cf35 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -1363,6 +1364,7 @@ ProcessedNode::ProcessedNode( } void ProcessedNode::run() { + DCHECK(verify_outputs_not_overlapping_with_immutable_inputs()); if (fn_) { fn_(this); } else if (native_fn_) { @@ -1385,5 +1387,30 @@ void ProcessedNode::run() { } } +bool ProcessedNode::verify_outputs_not_overlapping_with_immutable_inputs() + const { + auto schema = node()->maybeSchema(); + if (!schema || schema->is_mutable()) { + return true; + } + for (const IValue* in : inputs_) { + if (!in->isTensor()) { + continue; + } + const auto& in_t = in->toTensor(); + for (const IValue& out : outputs_) { + if (!out.isTensor()) { + continue; + } + const auto& out_t = out.toTensor(); + at::MemOverlapStatus status = at::get_overlap_status(in_t, out_t); + if (status != at::MemOverlapStatus::NO) { + return false; + } + } + } + return true; +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 5d3527246868a..e28dcc32b61e9 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -405,6 +405,8 @@ class ProcessedNode { return static_cast(fn_); } + bool verify_outputs_not_overlapping_with_immutable_inputs() const; + private: Node* node_; c10::optional op_; From 94840969e48a228ed19a0289ebc55ce04f83f7ad Mon Sep 17 00:00:00 2001 From: Dimitrije Jankov Date: Mon, 12 Jul 2021 20:42:06 -0700 Subject: [PATCH 115/122] SGX can not read from /dev/urandom (#60368) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60368 Problem: The SGX secure enclave does not support reading from /dev/urandom as it is isolated from the OS for greater security. The SGX api provides a way to generate random numbers as a replacment. Solution: Conditionally enable SGX api for random number generation when building for it. Test Plan: Run the PyTorch tests Reviewed By: malfet, LiJihang Differential Revision: D29022616 fbshipit-source-id: 1c7115457a2abde682df4d55fa4a8446fc5f8613 --- c10/core/GeneratorImpl.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/c10/core/GeneratorImpl.cpp b/c10/core/GeneratorImpl.cpp index 7fb5571b516f7..720b1b27a68a0 100644 --- a/c10/core/GeneratorImpl.cpp +++ b/c10/core/GeneratorImpl.cpp @@ -2,6 +2,10 @@ #include #include +#if defined(__SGX_ENABLED__) +#include +#endif + #ifndef _WIN32 #include #include @@ -57,7 +61,9 @@ static uint64_t readURandomLong() { /** * Gets a non deterministic random number number from either the * /dev/urandom or the current time. For CUDA, gets random from - * std::random_device and adds a transformation on it. + * std::random_device and adds a transformation on it. 
For Intel SGX + * platform use sgx_read_rand as reading from /dev/urandom is + * prohibited on that platfrom. * * FIXME: The behavior in this function is from legacy code * (THRandom_seed/THCRandom_seed) and is probably not the right thing to do, @@ -76,6 +82,10 @@ uint64_t getNonDeterministicRandom(bool is_cuda) { s = (uint64_t)std::chrono::high_resolution_clock::now() .time_since_epoch() .count(); +#elif defined(__SGX_ENABLED__) + TORCH_CHECK( + sgx_read_rand(reinterpret_cast(&s), sizeof(s)) == SGX_SUCCESS, + "Could not generate random number with sgx_read_rand."); #else s = readURandomLong(); #endif From 5144381b1d7ce6aebb159615c518b2472136b34c Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Mon, 12 Jul 2021 23:19:24 -0700 Subject: [PATCH 116/122] [pytorch][JIT] Widen exception caught by ScriptList casting (#61520) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61520 This commit widens the exception caught by the try-catch block that checks if an object passed to a scripted function is a `ScriptList`. It turns out that there are internal tests that do not throw a `py::cast_error` so catching only that is not sufficient. Test Plan: Ran the failing tests in T94889011. Reviewed By: Chillee Differential Revision: D29560815 fbshipit-source-id: 442258f8997146d833a9d5db923e1f6359f2bfdd --- torch/csrc/jit/python/pybind_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp index ffeb2cb9052b6..3e6294cb200e0 100644 --- a/torch/csrc/jit/python/pybind_utils.cpp +++ b/torch/csrc/jit/python/pybind_utils.cpp @@ -108,7 +108,7 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional N) { try { auto script_list = py::cast(obj); return script_list.list_; - } catch (py::cast_error& e) { + } catch (...) { } // If not (i.e. it is a regular Python list), make a new From 2e49c5dc37bce5ce8ba463a59aacb5d3e4a638b6 Mon Sep 17 00:00:00 2001 From: Jiewen Tan Date: Tue, 13 Jul 2021 00:55:56 -0700 Subject: [PATCH 117/122] Move GetArgumentNamesModule registration to InterpreterManager() (#61549) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61549 Move GetArgumentNamesModule registration to InterpreterManager() such that the module is a permanent part of the interpreters and can be used by InterpreterSession.global() freely. Test Plan: [... 
~/fbsource/fbcode/caffe2] buck test mode/dev caffe2/fb/predictor:pytorch_predictor_test -- PyTorchDeployPredictor.GetArgumentNames Reviewed By: wconstab Differential Revision: D29643460 fbshipit-source-id: cf132d4795cbb334ce164ac715d590a105535508 --- torch/csrc/deploy/deploy.cpp | 29 +++++++++++++++++++++++++++++ torch/csrc/deploy/deploy.h | 23 ++--------------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/torch/csrc/deploy/deploy.cpp b/torch/csrc/deploy/deploy.cpp index a84cbb7c92509..4ba68b57e3f3c 100644 --- a/torch/csrc/deploy/deploy.cpp +++ b/torch/csrc/deploy/deploy.cpp @@ -31,6 +31,35 @@ extern "C" __attribute__(( namespace torch { namespace deploy { +InterpreterManager::InterpreterManager(size_t n_interp) : resources_(n_interp) { + TORCH_DEPLOY_TRY + for (const auto i : c10::irange(n_interp)) { + instances_.emplace_back(this); + auto I = instances_.back().acquire_session(); + // make torch.version.interp be the interpreter id + // can be used for balancing work across GPUs + I.global("torch", "version").attr("__setattr__")({"interp", int(i)}); + // std::cerr << "Interpreter " << i << " initialized\n"; + instances_.back().pImpl_->set_find_module( + [this](const std::string& name) -> at::optional { + auto it = registered_module_sources_.find(name); + if (it != registered_module_sources_.end()) { + return it->second; + } else { + return at::nullopt; + } + }); + } + + // Pre-registered modules. + // TODO(jwtan): Make the discovery of these modules easier. + register_module_source( + "GetArgumentNamesModule", + "from inspect import signature\n" + "def getArgumentNames(function): return list(signature(function).parameters.keys())\n"); + TORCH_DEPLOY_SAFE_CATCH_RETHROW +} + Package InterpreterManager::load_package(const std::string& uri) { TORCH_DEPLOY_TRY return Package(uri, this); diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h index 3a7cfd1060715..5e84571486ca5 100644 --- a/torch/csrc/deploy/deploy.h +++ b/torch/csrc/deploy/deploy.h @@ -109,27 +109,8 @@ struct TORCH_API LoadBalancer { }; struct TORCH_API InterpreterManager { - InterpreterManager(size_t n_interp = 2) : resources_(n_interp) { - TORCH_DEPLOY_TRY - for (const auto i : c10::irange(n_interp)) { - instances_.emplace_back(this); - auto I = instances_.back().acquire_session(); - // make torch.version.interp be the interpreter id - // can be used for balancing work across GPUs - I.global("torch", "version").attr("__setattr__")({"interp", int(i)}); - // std::cerr << "Interpreter " << i << " initialized\n"; - instances_.back().pImpl_->set_find_module( - [this](const std::string& name) -> at::optional { - auto it = registered_module_sources_.find(name); - if (it != registered_module_sources_.end()) { - return it->second; - } else { - return at::nullopt; - } - }); - } - TORCH_DEPLOY_SAFE_CATCH_RETHROW - } + explicit InterpreterManager(size_t n_interp = 2); + // get a free model, guarenteed that no other user of acquire_one has the same // model. It _is_ possible that other users will be using the interpreter. InterpreterSession acquire_one() { From ac6ec0efa127f5d7d35e829b97b3cdd383c4beef Mon Sep 17 00:00:00 2001 From: Michael Melesse Date: Tue, 13 Jul 2021 07:06:58 -0700 Subject: [PATCH 118/122] [ROCM] fix bug in #60313 (#61073) Summary: This PR fixes a bug in https://github.com/pytorch/pytorch/issues/60313. Where the tensors generated by _generate_valid_rocfft_input are on the cpu instead of the gpu. 
This was due to using numpy to generate tensors and converting it to pytorch using torch.from_numpy. This leads to the generated tensors staying on the cpu. We now generate the tensors using pytorch itself which carries over the device type of the input tensors to the generated tensor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61073 Reviewed By: H-Huang Differential Revision: D29668418 Pulled By: malfet fbshipit-source-id: ce2025c26d079c15603a89b9bf7878f48d73155e --- test/test_spectral_ops.py | 153 +++++++++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 28 deletions(-) diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 7d57de12e56cf..90852540f51bc 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -102,27 +102,116 @@ class TestFFT(TestCase): # (i.e. it cannot be a set of random numbers) # So for ROCm, call np.fft.rfftn and use its output as the input # for testing ops that call hipfftExecC2R - def _generate_valid_rocfft_input(self, input, op): - # check if op can invoke hipfftExecC2R or hipfftExecZ2D - if type(op) == SpectralFuncInfo: - supported_ops = op.supported_dtypes("") - if not all(ctype in supported_ops for ctype in [torch.cfloat, torch.double]): - return input - else: - if op.__name__ in ["fft_rfft2"]: - return input + def _generate_valid_rocfft_input(self, input, op, s, dim, norm): + def get_op_name(op): + if type(op) == SpectralFuncInfo: + return op.name + else: + return op.__name__ + + op_name = get_op_name(op) + + # pick ops that call hipfftExecC2R or hipfftExecZ2D + if op_name == "fft.irfft": + n = s + # figure out fft_size + if dim is None and n is None: + dim = tuple(range(-(input.dim()), 0)) + s = [input.size(d) for d in dim] + elif dim is None and n is not None: + dim = -1 + s = [n] + elif dim is not None and n is None: + s = [input.size(d) for d in [dim]] + else: + s = [n] + fft_size = s[-1] + # make fft_size even to match rocfft behavior to cuda and numpy + if (fft_size % 2) != 0: + n = fft_size + 1 - # if input is complex use the real part - if torch.is_complex(input): - np_input_real = input.real.cpu().numpy() - else: - np_input_real = input.cpu().numpy() + # generate Hermitian symmetric input + if torch.is_complex(input): + valid_input = torch.fft.rfft(input.real, n=n, dim=dim, norm=norm) + else: + valid_input = torch.fft.rfft(input, n=n, dim=dim, norm=norm) + + return (valid_input, n, dim, norm) + elif op_name == "fft.irfftn": + # figure out fft_size + if dim is None and s is None: + dim = tuple(range(-(input.dim()), 0)) + s = [input.size(d) for d in dim] + elif dim is None and s is not None: + dim = tuple(range(-(len(s)), 0)) + elif dim is not None and s is None: + s = [input.size(d) for d in dim] + + fft_size = s[-1] + + # make fft_size even to match rocfft behavior to cuda and numpy + if (fft_size % 2) != 0: + if type(s) is tuple: + s = list(s) + s[-1] = fft_size + 1 + + # generate Hermitian symmetric input + if torch.is_complex(input): + valid_input = torch.fft.rfftn(input.real, s=s, dim=dim, norm=norm) + else: + valid_input = torch.fft.rfftn(input, s=s, dim=dim, norm=norm) + return (valid_input, s, dim, norm) + elif op_name == "fft_irfft2": + # figure out fft_size + if dim is None and s is None: + dim = tuple(range(-(2), 0)) + s = [input.size(d) for d in dim] + elif dim is None and s is not None: + dim = tuple(range(-(len(s)), 0)) + elif dim is not None and s is None: + s = [input.size(d) for d in dim] + fft_size = s[-1] + + # make fft_size even to match rocfft behavior to cuda 
and numpy + if (fft_size % 2) != 0: + if type(s) is tuple: + s = list(s) + s[-1] = fft_size + 1 + # generate Hermitian symmetric input + if torch.is_complex(input): + valid_input = torch.fft.rfft2(input.real, s=s, dim=dim, norm=norm) + else: + valid_input = torch.fft.rfft2(input, s=s, dim=dim, norm=norm) + return (valid_input, s, dim, norm) + elif op_name == "fft.hfft": + n = s + # figure out fft_size + if dim is None and n is None: + dim = tuple(range(-(input.dim()), 0)) + s = [input.size(d) for d in dim] + elif dim is None and n is not None: + dim = -1 + s = [n] + elif dim is not None and n is None: + s = [input.size(d) for d in [dim]] + else: + s = [n] + fft_size = s[-1] - # generate Hermitian symmetric input using rfftn - rfft_output = np.fft.rfftn(np_input_real) + # make fft_size even to match rocfft behavior to cuda and numpy + if (fft_size % 2) != 0: + n = fft_size + 1 - return torch.from_numpy(rfft_output) + # generate Hermitian symmetric input + if torch.is_complex(input): + valid_input = torch.fft.ihfft(input.real, n=n, dim=dim, norm=norm) + else: + valid_input = torch.fft.ihfft(input, n=n, dim=dim, norm=norm) + + return (valid_input, n, dim, norm) + else: + return (input, s, dim, norm) @onlyOnCPUAndCUDA @ops([op for op in spectral_funcs if not op.ndimensional]) @@ -158,8 +247,9 @@ def test_reference_1d(self, device, dtype, op): input = args[0] args = args[1:] - if torch.version.hip is not None: - input = self._generate_valid_rocfft_input(input, op) + if torch.version.hip is not None and input.device.type == 'cuda': + input, args[0], args[1], args[2] = self._generate_valid_rocfft_input( + input, op, args[0], args[1], args[2]) expected = op.ref(input.cpu().numpy(), *args) exact_dtype = dtype in (torch.double, torch.complex128) @@ -300,10 +390,10 @@ def test_reference_nd(self, device, dtype, op): shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) input = torch.randn(*shape, device=device, dtype=dtype) - if torch.version.hip is not None: - input = self._generate_valid_rocfft_input(input, op) - for norm in norm_modes: + if torch.version.hip is not None: + input, s, dim, norm = self._generate_valid_rocfft_input( + input, op, s, dim, norm) expected = op.ref(input.cpu().numpy(), s, dim, norm) exact_dtype = dtype in (torch.double, torch.complex128) actual = op(input, s, dim, norm) @@ -404,22 +494,29 @@ def fn(t: torch.Tensor, s: Optional[List[int]], dim: List[int] = (-2, -1), norm: torch_fns = (torch_fn, torch.jit.script(fn)) if torch.version.hip is not None: - valid_input = self._generate_valid_rocfft_input(input, torch_fn) + valid_input_default, s, _, norm = self._generate_valid_rocfft_input( + input, torch_fn, s, None, norm) else: - valid_input = input + valid_input_default = input # Once with dim defaulted - input_np = valid_input.cpu().numpy() + input_np = valid_input_default.cpu().numpy() expected = numpy_fn(input_np, s, norm=norm) for fn in torch_fns: - actual = fn(valid_input, s, norm=norm) + actual = fn(valid_input_default, s, norm=norm) self.assertEqual(actual, expected) # Once with explicit dims dim = (1, 0) - expected = numpy_fn(valid_input.cpu(), s, dim, norm) + if torch.version.hip is not None: + valid_input_explicit, s, dim, norm = self._generate_valid_rocfft_input( + input, torch_fn, s, dim, norm) + else: + valid_input_explicit = input + + expected = numpy_fn(valid_input_explicit.cpu(), s, dim, norm) for fn in torch_fns: - actual = fn(valid_input, s, dim, norm) + actual = fn(valid_input_explicit, s, dim, norm) self.assertEqual(actual, expected) 
@skipCPUIfNoFFT From 0afbb9e81e92a8a01b79c7ae47a4ea658be71e8a Mon Sep 17 00:00:00 2001 From: Tongliang Liao Date: Tue, 13 Jul 2021 07:07:09 -0700 Subject: [PATCH 119/122] `PYTHON_LIBRARY` may be set to empty or NOTFOUND. (#61230) Summary: Not sure why (maybe from dependencies?) but it can certainly break package lookup upon re-entry of cmake. So instead of checking whether they are defined, we should check whether there is any meaningful value inside. Fixes https://github.com/pytorch/pytorch/issues/59887 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61230 Reviewed By: H-Huang Differential Revision: D29668766 Pulled By: malfet fbshipit-source-id: 79a59578740c4434327aff4f9a22eba9c4bf48d1 --- cmake/Dependencies.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 758a9a90a444d..8de30e2c41d7f 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -917,7 +917,7 @@ include_directories(SYSTEM ${EIGEN3_INCLUDE_DIR}) # ---[ Python + Numpy if(BUILD_PYTHON) # If not given a Python installation, then use the current active Python - if(NOT DEFINED PYTHON_EXECUTABLE) + if(NOT PYTHON_EXECUTABLE) execute_process( COMMAND "which" "python" RESULT_VARIABLE _exitcode OUTPUT_VARIABLE _py_exe) if(${_exitcode} EQUAL 0) @@ -946,7 +946,7 @@ if(BUILD_PYTHON) # executable that we already found (if we didn't actually find an executable # then these will just use "python", but at least they'll be consistent with # each other). - if(NOT DEFINED PYTHON_INCLUDE_DIR) + if(NOT PYTHON_INCLUDE_DIR) # TODO: Verify that sysconfig isn't inaccurate pycmd_no_exit(_py_inc _exitcode "import sysconfig; print(sysconfig.get_path('include'))") if("${_exitcode}" EQUAL 0 AND IS_DIRECTORY "${_py_inc}") @@ -955,9 +955,9 @@ if(BUILD_PYTHON) else() message(WARNING "Could not set Python's include dir to ${_py_inc} from sysconfig") endif() - endif(NOT DEFINED PYTHON_INCLUDE_DIR) + endif(NOT PYTHON_INCLUDE_DIR) - if(NOT DEFINED PYTHON_LIBRARY) + if(NOT PYTHON_LIBRARY) pycmd_no_exit(_py_lib _exitcode "import sysconfig; print(sysconfig.get_path('stdlib'))") if("${_exitcode}" EQUAL 0 AND EXISTS "${_py_lib}" AND EXISTS "${_py_lib}") set(PYTHON_LIBRARY "${_py_lib}") @@ -967,7 +967,7 @@ if(BUILD_PYTHON) endif() message(STATUS "Setting Python's library to ${PYTHON_LIBRARY}") endif() - endif(NOT DEFINED PYTHON_LIBRARY) + endif(NOT PYTHON_LIBRARY) # These should fill in the rest of the variables, like versions, but resepct # the variables we set above From 9679fa7f306ec6e158375c1b48358cd98fde9f8d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 13 Jul 2021 07:08:44 -0700 Subject: [PATCH 120/122] Update cpp_extension.py (#61484) Summary: By default, majority of Python-3.[6789] installation comes with `pkg_resources.packaging` version 16.8 (or `setuptool` older than 49.6.0), which does not have major/minor properties on Version package, as one can observe in https://github.com/pypa/setuptools/blob/v49.5.0/pkg_resources/_vendor/packaging/version.py On the other hand, compare operators exists, so why not use it to check for version equality Fixes https://github.com/pytorch/pytorch/issues/61036 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61484 Reviewed By: walterddr, seemethere Differential Revision: D29643883 Pulled By: malfet fbshipit-source-id: 3db9168c1b009ac3a278709083ea8c5b417471b8 --- torch/utils/cpp_extension.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 92af491d35eb6..b313423426caa 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -773,12 +773,12 @@ def _check_cuda_version(self): cuda_str_version = cuda_version.group(1) cuda_ver = packaging.version.parse(cuda_str_version) torch_cuda_version = packaging.version.parse(torch.version.cuda) - if cuda_ver.major != torch_cuda_version.major: # type: ignore[attr-defined] - raise RuntimeError(CUDA_MISMATCH_MESSAGE.format( - cuda_str_version, torch.version.cuda)) - elif cuda_ver.minor != torch_cuda_version.minor: # type: ignore[attr-defined] - warnings.warn(CUDA_MISMATCH_WARN.format( - cuda_str_version, torch.version.cuda)) + if cuda_ver != torch_cuda_version: + # major/minor attributes are only available in setuptools>=49.6.0 + if getattr(cuda_ver, "major", float("nan")) != getattr(torch_cuda_version, "major", float("nan")): + raise RuntimeError(CUDA_MISMATCH_MESSAGE.format(cuda_str_version, torch.version.cuda)) + warnings.warn(CUDA_MISMATCH_WARN.format(cuda_str_version, torch.version.cuda)) + else: raise RuntimeError(CUDA_NOT_FOUND_MESSAGE) From 3e5d2b539d4704d368f88aa0079f5bb1d05a9607 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Tue, 13 Jul 2021 08:18:10 -0700 Subject: [PATCH 121/122] Replace deprecated comment with C10_DEPRECATED in linalg.h (#60374) Summary: Replace // DEPRECATED comment with C10_DEPRECATED. Pull Request resolved: https://github.com/pytorch/pytorch/pull/60374 Reviewed By: H-Huang Differential Revision: D29661630 Pulled By: heitorschueroff fbshipit-source-id: fc086276fd7d3ddfb8d17c67ade456377ef0e990 --- torch/csrc/api/include/torch/linalg.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index eedeb5a33bdb1..d3b5278bbe99c 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -222,7 +222,7 @@ inline Tensor cholesky_out(Tensor& result, const Tensor& self) { return detail::cholesky_out(result, self); } -/// DEPRECATED +// C10_DEPRECATED_MESSAGE("linalg_det is deprecated, use det instead.") inline Tensor linalg_det(const Tensor& self) { return detail::det(self); } @@ -302,22 +302,22 @@ inline std::tuple lstsq(const Tensor& self, cons return detail::lstsq(self, b, cond, driver); } -/// DEPRECATED +// C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") inline Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } -/// DEPRECATED +// C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") inline Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, ord, opt_dim, keepdim, opt_dtype); } -/// DEPRECATED +// C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } -/// DEPRECATED +// C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, ord, 
opt_dim, keepdim, opt_dtype); }

From d5204064dcb4388959784bcddf60c5301d6979ac Mon Sep 17 00:00:00 2001
From: Rohan Varma
Date: Tue, 13 Jul 2021 10:33:18 -0700
Subject: [PATCH 122/122] [BE] Fix flaky ProcessGroupGloo tests (#61396)

Summary:
A hypothesis as to why tests such as
https://github.com/pytorch/pytorch/issues/57469 may be flaky is that
`c10d = ProcessGroupGloo(...)` is not actually guaranteed to be a
synchronization point, so some ranks may create the PG, run all the error
checking (which does not actually call into gloo APIs and so requires no
synchronization), and then exit, all before other ranks have created the
gloo pg. This can result in the following error:

```
File "distributed/test_c10d_gloo.py", line 1037, in test_reduce_checks
May 03 06:42:34 pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
May 03 06:42:34 RuntimeError: [/var/lib/jenkins/workspace/third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [127.0.0.1]:35521
```

which indicates that the remote end has hung up.

Furthermore, all the flaky tests in this file only do error checking and
don't call into the gloo APIs, further indicating that this issue may be
the root cause.

Not 100% sure this PR will fix it because I haven't been able to actually
repro the issue even after 10000+ runs, but it happens regularly in CI.

To fix this, we add a `dist.barrier(group=pg)` call after creating the pg
to enforce synchronization. Would be good to land this and observe whether
it helps with the flakiness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61396

Reviewed By: mrshenli

Differential Revision: D29664189

Pulled By: rohan-varma

fbshipit-source-id: bc046d5d816fe6cb426522b85312383bfa3f90b7
---
 test/distributed/test_c10d_gloo.py | 66 ++++++++++++++++--------------
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 1c319f1a9bda9..11b91edd419c6 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -208,6 +208,12 @@ def test_default_store_timeout_gloo(self):
     "TSAN is not fork-safe since we're forking in a multi-threaded environment",
 )
 class ProcessGroupGlooTest(MultiProcessTestCase):
+    def _create_process_group_gloo(self, store, rank, world_size, opts):
+        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts)
+        dist.barrier(group=pg)
+        return pg
+
+
     def setUp(self):
         super(ProcessGroupGlooTest, self).setUp()
 
@@ -232,7 +238,7 @@ def test_multi_device_constructor(self):
             create_device(interface=LOOPBACK),
             create_device(interface=LOOPBACK),
         ]
-        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts)
+        pg = self._create_process_group_gloo(store, self.rank, self.world_size, opts)
 
         # Execute 2x the number of operations to ensure we use every device.
for fut in [pg.allreduce(torch.ones(i + 1)).get_future() for i in range(4)]: @@ -240,7 +246,7 @@ def test_multi_device_constructor(self): def test_empty_tensors(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) xs = [torch.FloatTensor([])] fut = pg.broadcast(xs).get_future() @@ -251,7 +257,7 @@ def test_empty_tensors(self): def test_broadcast_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1], dtype=torch.float32) t2 = torch.zeros([1], dtype=torch.float64) @@ -301,7 +307,7 @@ def test_broadcast_checks(self): def _test_broadcast_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) def broadcast(xs, rootRank, rootTensor): opts = c10d.BroadcastOptions() @@ -349,7 +355,7 @@ def test_broadcast_basics_cuda(self): def _test_broadcast_stress(self, inputs): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) work_handles = [ @@ -377,7 +383,7 @@ def test_broadcast_stress_cuda(self): def test_allreduce_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1], dtype=torch.float32) t2 = torch.zeros([1], dtype=torch.float64) @@ -397,7 +403,7 @@ def test_allreduce_checks(self): def _test_allreduce_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) # Single input tests tests = simple_reduce_tests(self.rank, self.world_size) @@ -444,7 +450,7 @@ def test_allreduce_basics_cuda(self): # This should go away as we deprecate it. 
def _test_allreduce_basics_using_work_api(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) # Single input tests tests = simple_reduce_tests(self.rank, self.world_size) @@ -489,7 +495,7 @@ def test_allreduce_basics_cuda_using_work_api(self): def _test_allreduce_stress(self, inputs): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) future_handles = [pg.allreduce(inputs[i]).get_future() for i in range(len(inputs))] @@ -519,7 +525,7 @@ def test_allreduce_stress_cuda(self): def test_allreduce_coalesced_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros(1, dtype=torch.float32) t2 = torch.zeros(1, dtype=torch.float64) @@ -544,7 +550,7 @@ def test_allreduce_coalesced_checks(self): @skip_if_lt_x_gpu(1) def test_allreduce_coalesced_checks_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros(1, dtype=torch.float32) @@ -554,7 +560,7 @@ def test_allreduce_coalesced_checks_cuda(self): def _test_allreduce_coalesced_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) test_cases = simple_coalesced_reduce_tests(self.rank, self.world_size) for op, inputs, outputs in test_cases: @@ -573,7 +579,7 @@ def test_allreduce_coalesced_basics(self): def _test_allreduce_coalesced_stress(self, inputs): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) future_handles = [pg.allreduce_coalesced(input).get_future() for input in inputs] @@ -601,7 +607,7 @@ def test_allreduce_coalesced_stress(self): def test_sparse_allreduce_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1]) t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,)) @@ -628,7 +634,7 @@ def test_sparse_allreduce_checks(self): def _test_sparse_allreduce_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) for num_inputs_per_rank in [1, 2]: tests = simple_sparse_reduce_tests( @@ -652,7 +658,7 @@ def test_sparse_allreduce_basics_cuda(self): def test_scatter_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1], dtype=torch.float32) t2 = torch.zeros([1], dtype=torch.float64) @@ 
-727,7 +733,7 @@ def test_scatter_checks(self): def _test_scatter_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) # Preallocate tensors for input/output input = [fn(torch.tensor([self.rank])) for _ in range(self.world_size)] @@ -758,7 +764,7 @@ def test_scatter_basics_cuda(self): def _test_scatter_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) outputs = [ @@ -808,7 +814,7 @@ def test_scatter_stress_cuda(self): def test_gather_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1], dtype=torch.float32) t2 = torch.zeros([1], dtype=torch.float64) @@ -887,7 +893,7 @@ def test_gather_checks(self): def _test_gather_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) # Preallocate tensors for input/output input = [fn(torch.tensor([self.rank]))] @@ -920,7 +926,7 @@ def test_gather_basics_cuda(self): def _test_gather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) future_handles = [] @@ -966,7 +972,7 @@ def test_gather_stress_cuda(self): def test_allgather_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) t1 = torch.zeros([1], dtype=torch.float32) t2 = torch.zeros([1], dtype=torch.float64) @@ -1009,7 +1015,7 @@ def test_allgather_checks(self): def _test_allgather_basics(self, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) # Run with N input tensor per rank for n in [1, 2, 3]: @@ -1038,7 +1044,7 @@ def test_allgather_basics_cuda(self): def _test_allgather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( + pg = self._create_process_group_gloo( store, self.rank, self.world_size, self.opts(threads=8) ) future_handles = [] @@ -1075,7 +1081,7 @@ def test_allgather_stress_cuda(self): def test_allgather_coalesced_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts()) dummy_input = [torch.zeros([1], dtype=torch.float32)] dummy_output_lists = [ [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size) @@ -1111,7 +1117,7 @@ def test_allgather_coalesced_checks(self): def test_reduce_checks(self): store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, 
self.opts())
+        pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts())
 
         t1 = torch.zeros([1], dtype=torch.float32)
 
@@ -1143,7 +1149,7 @@ def test_reduce_checks(self):
 
     def _test_reduce_basics(self, fn):
         store = c10d.FileStore(self.file_name, self.world_size)
-        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
+        pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts())
         for (op, input, output) in simple_reduce_tests(self.rank, self.world_size):
             for root in range(self.world_size):
                 opts = c10d.ReduceOptions()
@@ -1166,7 +1172,7 @@ def test_reduce_basics_cuda(self):
 
     def _test_reduce_stress(self, inputs):
         store = c10d.FileStore(self.file_name, self.world_size)
-        pg = c10d.ProcessGroupGloo(
+        pg = self._create_process_group_gloo(
             store, self.rank, self.world_size, self.opts(threads=8)
         )
         future_handles = []
@@ -1210,7 +1216,7 @@ def test_reduce_stress_cuda(self):
 
     def test_send_recv_all_to_all(self):
         store = c10d.FileStore(self.file_name, self.world_size)
-        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
+        pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts())
 
         # Preallocate tensors for input/output
         inputs = [torch.tensor([self.rank]) for _ in range(self.world_size)]
@@ -1248,7 +1254,7 @@ def test_send_recv_all_to_all(self):
 
     def test_barrier_implies_wait(self):
         store = c10d.FileStore(self.file_name, self.world_size)
-        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
+        pg = self._create_process_group_gloo(store, self.rank, self.world_size, self.opts())
 
         # Kick off allreduce operations
         size = (100, 100)
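
The essence of the fix is that the collective barrier, not the constructor, is what synchronizes ranks. Here is a standalone sketch of the pattern the `_create_process_group_gloo` helper encodes, assuming a gloo-enabled `torch.distributed` build; the free-function name and the usage comments are illustrative, not part of the patch.

```python
import torch.distributed as dist


def create_synced_gloo_pg(store, rank, world_size, opts):
    # Constructing the process group does not guarantee that peers have
    # rendezvoused: a fast rank can build the group, run a test that only
    # does argument checking (never touching the gloo transport), and exit
    # while other ranks are still connecting, which those ranks then observe
    # as "Connection closed by peer".
    pg = dist.ProcessGroupGloo(store, rank, world_size, opts)
    # The barrier is the real synchronization point: no rank returns from
    # this helper until every rank has constructed the group.
    dist.barrier(group=pg)
    return pg


# Illustrative usage: every rank builds a store over the same shared file and
# calls the helper before running its test body.
# store = dist.FileStore(shared_file_name, world_size)
# pg = create_synced_gloo_pg(store, rank, world_size, opts)
```
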