Fix max_pool2d with ceil_mode bug #46558

Closed
aten/src/ATen/native/Pool.h (3 additions, 2 deletions)

@@ -28,11 +28,12 @@ static inline T pooling_output_shape_pad_lr(
   T outputSize = div_rtn<T>(
       inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
       (ceil_mode ? stride - 1 : 0), stride) + 1;
-  if (pad_l) {
+  if (ceil_mode) {
     // ensure that the last pooling starts inside the image
     // needed to avoid problems in ceil mode
-    if ((outputSize - 1) * stride >= inputSize + pad_l)
+    if ((outputSize - 1) * stride >= inputSize + pad_l + pad_r) {
       --outputSize;
+    }
   }
   return outputSize;
 }
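
The corrected logic is easy to sanity-check outside of ATen. The sketch below (plain Python, written for this description and not part of the PR) mirrors the updated pooling_output_shape_pad_lr: the floor division with the ceil_mode offset, followed by the clamp, which now runs whenever ceil_mode is set and compares against both paddings.

def pooling_output_shape_pad_lr(input_size, kernel_size, pad_l, pad_r,
                                stride, dilation, ceil_mode):
    # Python rendering of the updated Pool.h logic (illustrative only).
    output_size = (input_size + pad_l + pad_r - dilation * (kernel_size - 1) - 1
                   + (stride - 1 if ceil_mode else 0)) // stride + 1
    if ceil_mode:
        # ensure that the last pooling window starts inside the padded input
        if (output_size - 1) * stride >= input_size + pad_l + pad_r:
            output_size -= 1
    return output_size

# Depth dimension of the new ceil_mode check added in test_nn.py below:
# input 3, kernel 2, stride 2, padding 1 on each side, dilation 1.
print(pooling_output_shape_pad_lr(3, 2, 1, 1, 2, 1, True))  # 3 (the old pad_l-only condition gave 2)
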
aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp (11 additions, 0 deletions)

@@ -1563,6 +1563,17 @@ void _qavg_pool_nhwc_kernel(
       float multiplier = input_scale / output_scale / divide_factor;
       int input_zero_point_m_size = -input_zero_point * size;
 
+      if (size == 0) {
+        for (int c = 0; c < nInputPlane; ++c) {
+          o_p[c] = at::native::quantize_val<scalar_t>(
+                       1.0f / multiplier,
+                       output_zero_point,
+                       input_zero_point_m_size)
+                       .val_;
+        }
+        continue;
+      }
+
       int c_start = 0;
 
       // For int8 quantization, we implicitly use int32 as accumulation
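
The new size == 0 branch covers a case that becomes reachable once the ceil_mode clamp above is relaxed: the last pooling window can start in the right padding, overlap no real input elements, and leave nothing to average. In that case the kernel writes quantize_val(1.0f / multiplier, output_zero_point, input_zero_point_m_size) with input_zero_point_m_size equal to 0, which works out to the output zero point, i.e. a value that dequantizes to 0. A small plain-Python illustration (not part of the PR) of when such an empty window appears:

# One spatial dimension: input length 3, kernel 2, stride 2,
# padding 1 on each side, ceil_mode=True.
input_size, kernel_size, stride, pad_l, pad_r = 3, 2, 2, 1, 1
out = (input_size + pad_l + pad_r - (kernel_size - 1) - 1 + (stride - 1)) // stride + 1
if (out - 1) * stride >= input_size + pad_l + pad_r:
    out -= 1                                   # corrected ceil_mode clamp: out == 3
for i in range(out):
    start = i * stride - pad_l                 # window start in input coordinates
    size = min(start + kernel_size, input_size) - max(start, 0)
    print(i, max(size, 0))                     # 0 1, 1 2, 2 0: the last window is empty
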
test/quantization/test_quantized_op.py (3 additions, 3 deletions)

@@ -91,9 +91,9 @@ def pool_output_shape(input_size, kernel_size, padding, stride,
     output_size = (
         (input_size + 2 * padding - dilation * (kernel_size - 1) - 1
          + (stride - 1 if ceiling_mode else 0)) // stride + 1)
-    if (padding > 0 and
-            ((output_size - 1) * stride >= input_size + padding)):
-        output_size += 1
+    if (ceiling_mode and
+            ((output_size - 1) * stride >= input_size + 2 * padding)):
+        output_size -= 1
     return output_size
 
 """
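
The helper above exists so the quantized tests can predict the output shapes of the ATen pooling kernels, and with this change it tracks the corrected formula in Pool.h. As a quick illustrative check (not part of the PR, assuming the fix is applied and assuming torch.nn.functional.max_pool2d dispatches to the quantized kernel for quantized inputs), the float and quantized paths should now agree on a ceil_mode shape that the old formula handled differently:

import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 6, 7)
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)

# kernel 1, stride 2, no padding, ceil_mode=True: expected shape (1, 1, 3, 4).
# The pre-fix formula skipped the ceil_mode clamp whenever padding was 0 and
# produced (1, 1, 4, 4) here.
y = F.max_pool2d(x, kernel_size=1, stride=2, ceil_mode=True)
qy = F.max_pool2d(qx, kernel_size=1, stride=2, ceil_mode=True)
print(y.shape, qy.shape)
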
test/test_nn.py (9 additions, 0 deletions)

@@ -10511,6 +10511,15 @@ def test(nonlinearity, *args, **kwargs):
         test('threshold', 3, 2)
         test('threshold', 3, 2, inplace=True)
 
+    def test_pooling_shape(self, device):
+        def check(expected_out_shape, sizes, *args, **kwargs):
+            t = torch.randn(sizes, device=device)
+            self.assertEqual(torch.nn.functional.max_pool3d(t, *args, **kwargs).shape, expected_out_shape)
+
+        check((1, 1, 3, 4), (1, 1, 6, 7), kernel_size=1, stride=2, padding=0, ceil_mode=True)
+        check((1, 2, 3, 3), (1, 3, 4, 5), kernel_size=2, stride=2, padding=1, ceil_mode=False)
+        check((1, 3, 3, 4), (1, 3, 4, 5), kernel_size=2, stride=2, padding=1, ceil_mode=True)
+
     @onlyOnCPUAndCUDA  # TODO: fix on XLA
     def test_adaptive_avg_pool2d_output_size_one(self, device):
         def helper(size, memory_format):
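
For reference, the first new check can be traced through the updated formula by hand. A short sketch (plain Python, not part of the PR) of its height dimension; the width dimension stays at 4 because its clamp condition is not met:

import math

# check((1, 1, 3, 4), (1, 1, 6, 7), kernel_size=1, stride=2, padding=0, ceil_mode=True)
# height: input 6, kernel 1, stride 2, padding 0
out = math.ceil((6 + 0 - (1 - 1) - 1) / 2) + 1   # ceil((6 - 1) / 2) + 1 == 4
# The 4th window would start at index (4 - 1) * 2 == 6, past the end of the
# input, so the ceil_mode clamp drops it; this is exactly the case the old
# "if (pad_l)" guard missed when padding is 0.
if (out - 1) * 2 >= 6 + 0:
    out -= 1
assert out == 3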