Closed
Labels: module: ci (Related to continuous integration), module: linalg (Issues related to specialized linear algebra operations in PyTorch; includes matrix multiply matmul), triaged (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module)
Description
🐛 Describe the bug
We are working on making CUDA 11.6 our stable CUDA version, and hence moving jobs from CUDA 11.3 and 10.2 to CUDA 11.6.
I am observing the following failure:
https://github.com/pytorch/pytorch/runs/8044912666?check_suite_focus=true
On this PR: #84120
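For reference, here is a minimal sketch of what this test exercises (an assumed setup with a hypothetical rank-deficient sample, not the exact OpInfo inputs): a double-backward gradcheck of `torch.linalg.det` on a singular float64 CUDA matrix.

```python
import torch
from torch.autograd import gradgradcheck

# Rank-deficient 3x3 input, as in the *_singular variant of the test
# (the actual OpInfo samples may differ in shape and batching).
a = torch.randn(3, 2, dtype=torch.float64, device='cuda')
x = (a @ a.mT).requires_grad_(True)  # rank <= 2, so det(x) == 0

# Raises GradcheckError when the analytical second-derivative Jacobian
# disagrees with the finite-difference one, as in the log below.
gradgradcheck(torch.linalg.det, (x,))
```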
ERROR [3.311s]: test_fn_gradgrad_linalg_det_singular_cuda_float64 (__main__.TestGradientsCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 1941, in wrapper
    method(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 1941, in wrapper
    method(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_device_type.py", line 391, in instantiated_test
    raise rte
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_device_type.py", line 378, in instantiated_test
    result = test(self, **param_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_device_type.py", line 853, in dep_fn
    return fn(slf, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_device_type.py", line 815, in test_wrapper
    return test(*args, **kwargs)
  File "/var/lib/jenkins/workspace/test/test_ops_gradients.py", line 190, in test_fn_gradgrad
    self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad')
  File "/var/lib/jenkins/workspace/test/test_ops_gradients.py", line 133, in _check_helper
    self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs))
  File "/opt/conda/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3233, in gradgradcheck
    return torch.autograd.gradgradcheck(fn, inputs, grad_outputs, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/gradcheck.py", line 1574, in gradgradcheck
    return gradcheck(
  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/gradcheck.py", line 1418, in gradcheck
    return _gradcheck_helper(**args)
  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/gradcheck.py", line 1432, in _gradcheck_helper
    _gradcheck_real_imag(gradcheck_fn, func, func_out, tupled_inputs, outputs, eps,
  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/gradcheck.py", line 1075, in _gradcheck_real_imag
    gradcheck_fn(func, func_out, tupled_inputs, outputs, eps,
  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/gradcheck.py", line 1131, in _slow_gradcheck
    raise GradcheckError(_get_notallclose_msg(a, n, i, j, complex_indices, test_imag))
torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0,
numerical:tensor([[-1.2642e-07,  2.8158e-07,  1.7074e-07, -7.6408e-08,  1.8773e-02,
         -1.0848e-02, -2.0257e-08, -2.6593e-02, -2.0109e-02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.6736e-13,  0.0000e+00,  0.0000e+00, -1.8773e-02,  0.0000e+00,
          4.9516e-02,  2.6593e-02,  0.0000e+00,  8.8741e-03,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-9.3607e-08,  2.0849e-07,  1.2642e-07,  1.0848e-02, -4.9516e-02,
          7.6408e-08,  2.0109e-02, -8.8740e-03,  2.0257e-08,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.1836e-07, -1.8774e-02,  1.0848e-02,  2.5285e-07, -5.6315e-07,
         -3.4148e-07,  6.7034e-08, -1.9080e-02, -1.0416e-02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 1.8773e-02,  0.0000e+00, -4.9516e-02,  0.0000e+00,  0.0000e+00,
         -8.6736e-13,  1.9080e-02,  0.0000e+00, -2.5707e-03,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.0848e-02,  4.9516e-02,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          8.6736e-13,  1.0415e-02,  2.5707e-03,  2.1684e-13,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-8.6736e-13,  2.6593e-02,  2.0109e-02, -8.6736e-13,  1.9080e-02,
          1.0415e-02, -2.1684e-13,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.6593e-02,  3.4694e-12, -8.8741e-03, -1.9080e-02,  1.7347e-12,
          2.5707e-03,  0.0000e+00,  4.3368e-13,  2.1684e-13,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0109e-02,  8.8741e-03, -1.7347e-12, -1.0415e-02, -2.5707e-03,
         -8.6736e-13,  2.1684e-13,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.3878e-11,  0.0000e+00, -1.4490e-01,  3.5436e-01,
         -1.7347e-12,  5.9408e-02,  1.2011e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -6.8057e-06,
          2.9150e-06, -1.0446e-05,  1.4491e-01, -2.0972e-06,  7.0634e-02,
         -5.9410e-02,  9.1734e-07, -1.4263e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          3.4694e-12, -1.3878e-11, -3.5436e-01, -7.0626e-02,  6.9389e-12,
         -1.2011e-01,  1.4263e-01, -3.4694e-12],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.4490e-01, -3.5436e-01,  6.9389e-12,  0.0000e+00, -6.9389e-12,
          1.7347e-12,  2.8592e-03, -1.9792e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -1.4491e-01,
          1.7955e-06, -7.0632e-02,  3.0158e-06, -1.2917e-06,  4.6289e-06,
         -2.8605e-03,  5.6502e-07,  8.0385e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.5436e-01,
          7.0625e-02,  3.5910e-06, -1.6832e-06,  7.2095e-07, -2.5835e-06,
          1.9792e-01, -8.0388e-02,  1.1300e-06],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -8.2096e-06,
         -5.9404e-02, -1.2012e-01,  5.9063e-06, -2.8617e-03,  1.9793e-01,
         -2.5835e-06,  1.1066e-06, -3.9653e-06],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  5.9427e-02,
         -8.2096e-06,  1.4266e-01,  2.8454e-03,  5.9063e-06, -8.0408e-02,
          6.0316e-06, -2.5835e-06,  9.2577e-06],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.2010e-01,
         -1.4263e-01, -8.2096e-06, -1.9792e-01,  8.0386e-02,  5.9063e-06,
         -1.6832e-06,  7.2095e-07, -2.5835e-06]], device='cuda:0',
       dtype=torch.float64)
analytical:tensor([[ 1.6487e-18,  2.5953e-19,  1.7094e-18,  1.0771e-18,  1.8773e-02,
         -1.0848e-02,  4.3586e-21, -2.6593e-02, -2.0109e-02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.7590e-19,  1.1971e-19, -1.6395e-18, -1.8773e-02, -3.5903e-18,
          4.9516e-02,  2.6593e-02,  3.9621e-19,  8.8741e-03,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 2.5358e-20, -5.2994e-18, -3.4862e-18,  1.0848e-02, -4.9516e-02,
         -2.9928e-19,  2.0109e-02, -8.8741e-03,  4.1405e-19,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.0177e-19, -1.8773e-02,  1.0848e-02,  1.0191e-19,  2.3322e-18,
          1.5783e-18,  1.0780e-19, -1.9080e-02, -1.0415e-02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 1.8773e-02,  2.9542e-20, -4.9516e-02,  4.6931e-18, -2.3037e-19,
          1.4534e-18,  1.9080e-02,  1.2146e-18, -2.5707e-03,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.0848e-02,  4.9516e-02,  2.0289e-18, -3.3563e-18,  2.1024e-18,
         -7.3856e-20,  1.0415e-02,  2.5707e-03, -7.5162e-19,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.5261e-18,  2.6593e-02,  2.0109e-02, -7.2256e-19,  1.9080e-02,
          1.0415e-02, -4.3381e-19, -2.0338e-19,  2.5978e-19,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.6593e-02, -8.5855e-19, -8.8741e-03, -1.9080e-02, -2.2611e-18,
          2.5707e-03,  2.0338e-19, -6.6969e-19, -3.7131e-19,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0109e-02,  8.8741e-03,  1.3362e-18, -1.0415e-02, -2.5707e-03,
          2.1464e-18,  1.7390e-19,  8.1441e-19,  2.5094e-19,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -7.7575e-18,
          1.1521e-17, -5.7016e-18, -1.1786e-17, -1.4490e-01,  3.5436e-01,
         -2.1773e-17,  5.9408e-02,  1.2011e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -2.5399e-17,
          8.2652e-18,  7.4699e-18,  1.4490e-01,  1.8690e-18,  7.0626e-02,
         -5.9408e-02, -1.1662e-17, -1.4263e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  5.5287e-17,
         -6.4569e-18, -1.9148e-17, -3.5436e-01, -7.0626e-02,  1.0427e-17,
         -1.2011e-01,  1.4263e-01,  2.1830e-17],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.8757e-17,
          1.4490e-01, -3.5436e-01,  3.6603e-17, -1.7146e-17,  5.1776e-17,
         -1.3481e-17,  2.8592e-03, -1.9792e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -1.4490e-01,
         -9.7294e-18, -7.0626e-02, -2.7567e-18,  3.4256e-18, -1.2143e-17,
         -2.8592e-03,  6.4823e-18,  8.0387e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.5436e-01,
          7.0626e-02, -5.4981e-17,  3.7349e-18, -4.7476e-18,  2.1103e-17,
          1.9792e-01, -8.0387e-02, -1.5705e-17],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.1132e-18,
         -5.9408e-02, -1.2011e-01,  8.9919e-18, -2.8592e-03,  1.9792e-01,
         -1.7702e-17,  3.7265e-18, -2.0405e-17],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  5.9408e-02,
         -6.4541e-18,  1.4263e-01,  2.8592e-03,  1.8921e-18, -8.0387e-02,
          8.3473e-18, -4.1916e-18,  1.7186e-17],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.2011e-01,
         -1.4263e-01, -8.7791e-18, -1.9792e-01,  8.0387e-02,  1.5111e-17,
         -7.3510e-18,  3.4224e-18,  6.8696e-18]], device='cuda:0',
       dtype=torch.float64)
----------------------------------------------------------------------
Ran 7438 tests in 9875.920s
FAILED (errors=2, skipped=3940, expected failures=70)
Generating XML reports...
Generated XML report: test-reports/python-unittest/test_ops_gradients/TEST-TestGradientsCUDA-20220826220207.xml
Traceback (most recent call last):
  File "/var/lib/jenkins/workspace/test/run_test.py", line 1065, in <module>
    main()
  File "/var/lib/jenkins/workspace/test/run_test.py", line 1043, in main
    raise RuntimeError(err_message)
RuntimeError: test_ops_gradients failed!
real	180m56.922s
user	187m48.936s
sys	12m11.785s
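For context on the "numerical" vs "analytical" tensors above: `gradgradcheck` wraps the first backward pass in a new function and runs ordinary `gradcheck` on it, comparing a central finite-difference Jacobian (default `eps=1e-6`) against the one assembled from autograd. A simplified sketch of how a single numerical entry is formed (not the exact implementation in `torch/autograd/gradcheck.py`):

```python
import torch

def numerical_jacobian_entry(fn, x, out_idx, in_idx, eps=1e-6):
    # Central-difference estimate of d fn(x)[out_idx] / d x[in_idx],
    # the kind of entry printed under "numerical:" above.
    flat = x.detach().clone().reshape(-1)
    flat[in_idx] += eps
    plus = fn(flat.reshape(x.shape)).reshape(-1)[out_idx]
    flat[in_idx] -= 2 * eps  # now at original value minus eps
    minus = fn(flat.reshape(x.shape)).reshape(-1)[out_idx]
    return (plus - minus) / (2 * eps)
```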
Versions
PyTorch Nightly 1.13
CUDA 11.6
cc @seemethere @malfet @pytorch/pytorch-dev-infra @jianyuh @nikitaved @pearu @mruberry @walterddr @IvanYashchuk @xwang233 @lezcano