Skip to content

Faster mul(sparse, sparse) with broadcasting in dense dims. #83428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 51 commits into from

Conversation

nikitaved
Copy link
Collaborator

@nikitaved nikitaved commented Aug 15, 2022

Stack from ghstack (oldest at bottom):

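Preliminary benchmarks of `x * y` for sparse COO tensors (square matrices of shape (n, n); each operand is built from nnz random index pairs, with every pair stored twice, so operands are uncoalesced unless `.coalesce()` is applied).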

Script
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Benchmark configuration.  ``name`` tags the build being measured: it is part
# of the column header in the printed table and of the output file name.
# ``device`` is the torch device string the operands are created on.
name = "PR"
device = "cuda"

# specifies (n, nnz)
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)


def gen_tensor(n, nnz, device, coalesce=False):
    """Build a random sparse COO matrix of shape ``(n, n)``.

    ``nnz`` random (row, col) index pairs and ``nnz`` random values are drawn,
    then both are concatenated with themselves, so the tensor is constructed
    from ``2 * nnz`` entries in which every index pair occurs at least twice.

    Args:
        n: Number of rows and of columns.
        nnz: Number of random index/value pairs drawn before duplication.
        device: Device string the index and value tensors are created on.
        coalesce: If true, return ``res.coalesce()`` instead of the raw tensor.

    Returns:
        The sparse COO tensor, coalesced only when ``coalesce`` is true.
    """
    shape = (n, n)
    nrows, ncols = shape
    # The order of the random draws (rows, columns, values) is kept fixed so a
    # given seed always yields the same operands.
    rowidx = torch.randint(low=0, high=nrows, size=(nnz,), device=device)
    colidx = torch.randint(low=0, high=ncols, size=(nnz,), device=device)
    itemidx = torch.vstack((rowidx, colidx))
    xvalues = torch.randn(nnz, device=device)
    # Store every entry twice so the input contains duplicate indices.
    itemidx = torch.hstack((itemidx, itemidx))
    xvalues = torch.hstack((xvalues, xvalues))
    res = torch.sparse_coo_tensor(itemidx, xvalues, size=shape)
    return res.coalesce() if coalesce else res


def main():
    """Time ``x * y`` for every problem size and coalescing combination.

    Prints the comparison table and pickles the list of measurements to
    ``f"{name}_{device}_mul.pickle"``.

    Returns:
        The list of measurements, one per (problem size, coalescing) case.
    """
    torch.manual_seed(13)
    results = []

    for n, nnz in problem_dims:
        for x_coalesce, y_coalesce in product((True, False), repeat=2):
            x = gen_tensor(n, nnz, device, x_coalesce)
            y = gen_tensor(n, nnz, device, y_coalesce)
            stmt = "x * y"
            # Only the two operands are needed by the timed statement, so do
            # not expose the whole module namespace to the timer.
            timer = Timer(stmt,
                          globals={"x": x, "y": y},
                          label="coo.mul",
                          description=f"{name}: mul, device: {device}",
                          sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
                          num_threads=torch.get_num_threads())
            results.append(timer.blocked_autorange())

    compare = Compare(results)
    compare.trim_significant_figures()
    compare.print()

    with open(f"{name}_{device}_mul.pickle", 'wb') as f:
        pickle.dump(results, f)

    return results


if __name__ == "__main__":
    results = main()
Gather results
import pickle
from torch.utils.benchmark import Timer, Compare

# Device whose result files are merged; it is part of each pickle's file name.
device = 'cuda'

# One pickle per benchmarked build, as written by the benchmark script.
files = ["PR", "master"]

# Concatenate the measurements of all builds into a single list so they are
# shown in one table.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, 'rb') as f:
        timers.extend(pickle.load(f))

compare = Compare(timers)
compare.trim_significant_figures()
compare.print()
CUDA
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
CPU
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8         
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34         
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34         
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56         
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24         
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100         
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100         
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255         
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200         
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023         
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036         
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989         
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7         
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30         
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30         
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50         
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7         
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100         
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100         
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210         
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7         
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500         
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510         
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457         
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25         
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130         
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130         
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255         
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100         
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290         
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357         
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783         
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211         
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480         
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539         
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801         
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374         
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620         
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080         
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650         
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000         
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200         
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600         
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900         

Times are in microseconds (us).

@facebook-github-bot
Copy link
Contributor

facebook-github-bot commented Aug 15, 2022

🔗 Helpful links

❌ 13 New Failures

As of commit 8f6bdea (more details on the Dr. CI page):

Expand to see more
  • 13/13 failures introduced in this PR

🕵️ 10 new failures recognized by patterns

The following CI failures do not appear to be due to upstream breakages

See GitHub Actions build pull / win-vs2019-cuda11.6-py3 / build (1/10)

Step: "Build" (full log | diagnosis details)

2022-09-06T11:40:52.9296214Z C:\actions-runner\...r initialization: '__this' is not a base or member
2022-09-06T11:40:52.9289419Z         [
2022-09-06T11:40:52.9289747Z             kernel_t=at::native::`anonymous-namespace'::CPUKernelLauncher,
2022-09-06T11:40:52.9290616Z             binary_op_t=at::native::`anonymous-namespace'::MulOp
2022-09-06T11:40:52.9290866Z         ]
2022-09-06T11:40:52.9291621Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen\native\sparse\SparseBinaryOpIntersectionKernel.cpp(32): note: see reference to function template instantiation 'at::Tensor &at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_out<at::native::`anonymous-namespace'::CPUKernelLauncher,at::native::`anonymous-namespace'::MulOp>(at::Tensor &,const at::Tensor &,const at::Tensor &,const bool)' being compiled
2022-09-06T11:40:52.9292554Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '_This': undeclared identifier
2022-09-06T11:40:52.9293159Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<hash_coeffs>': undeclared identifier
2022-09-06T11:40:52.9293752Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<sdim>': undeclared identifier
2022-09-06T11:40:52.9294483Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2439: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim': member could not be initialized
2022-09-06T11:40:52.9295330Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): note: see declaration of 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim'
2022-09-06T11:40:52.9296214Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2614: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>': illegal member initialization: '__this' is not a base or member
2022-09-06T11:40:52.9297132Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): fatal error C1903: unable to recover from previous error(s); stopping compilation
2022-09-06T11:40:53.3103353Z [4793/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\sparse\SparseTensorMath.cpp.obj
2022-09-06T11:40:53.5942154Z [4794/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorBackward.cpp.obj
2022-09-06T11:40:53.8124180Z [4795/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorTransformerFunctions.cpp.obj
2022-09-06T11:40:53.9193950Z [4796/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorMath.cpp.obj
2022-09-06T11:40:54.0523227Z [4797/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorUtils.cpp.obj
2022-09-06T11:40:54.0933253Z [4798/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\AffineQuantizer.cpp.obj
2022-09-06T11:40:54.3003215Z [4799/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\FakeQuantPerChannelAffine.cpp.obj
2022-09-06T11:40:54.3006710Z ninja: build stopped: subcommand failed.
2022-09-06T11:40:54.3113528Z -- Building version 1.13.0a0+git8f6bdea

See GitHub Actions build linux-binary-libtorch-cxx11-abi / libtorch-cpu-shared-with-deps-cxx11-abi-test / build (2/10)

Step: "Download Build Artifacts" (full log | diagnosis details)

2022-09-06T12:06:21.3432639Z ##[error]Can't fin...actions/checkout before running your local action?
2022-09-06T12:06:21.3352115Z Emitted 'error' event on WriteStream instance at:
2022-09-06T12:06:21.3352381Z     at emitErrorNT (node:internal/streams/destroy:157:8)
2022-09-06T12:06:21.3352657Z     at emitErrorCloseNT (node:internal/streams/destroy:122:3)
2022-09-06T12:06:21.3352952Z     at processTicksAndRejections (node:internal/process/task_queues:83:21) {
2022-09-06T12:06:21.3353238Z   errno: -13,
2022-09-06T12:06:21.3353438Z   code: 'EACCES',
2022-09-06T12:06:21.3353629Z   syscall: 'open',
2022-09-06T12:06:21.3354134Z   path: '/home/ec2-user/actions-runner/_work/_temp/artifacts/debug-libtorch-cxx11-abi-shared-with-deps-1.13.0.dev20220906+cpu-2a006215.zip'
2022-09-06T12:06:21.3354504Z }
2022-09-06T12:06:21.3418569Z ##[error]Can't find 'action.yml', 'action.yaml' or 'Dockerfile' under '/home/ec2-user/actions-runner/_work/pytorch/pytorch/pytorch/.github/actions/teardown-linux'. Did you forget to run actions/checkout before running your local action?
2022-09-06T12:06:21.3432639Z ##[error]Can't find 'action.yml', 'action.yaml' or 'Dockerfile' under '/home/ec2-user/actions-runner/_work/pytorch/pytorch/pytorch/.github/actions/chown-workspace'. Did you forget to run actions/checkout before running your local action?
2022-09-06T12:06:21.3458288Z Post job cleanup.
2022-09-06T12:06:21.3484306Z Post job cleanup.
2022-09-06T12:06:21.4531552Z Cleaning up orphan processes

See GitHub Actions build windows-binary-libtorch-release / libtorch-cpu-shared-with-deps-release-build (3/10)

Step: "Build PyTorch binary" (full log | diagnosis details)

2022-09-06T11:40:11.4716212Z C:\actions-runner\...r initialization: '__this' is not a base or member
2022-09-06T11:40:11.4700474Z         [
2022-09-06T11:40:11.4703615Z             kernel_t=at::native::`anonymous-namespace'::CPUKernelLauncher,
2022-09-06T11:40:11.4704102Z             binary_op_t=at::native::`anonymous-namespace'::MulOp
2022-09-06T11:40:11.4704482Z         ]
2022-09-06T11:40:11.4705745Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\sparse\SparseBinaryOpIntersectionKernel.cpp(32): note: see reference to function template instantiation 'at::Tensor &at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_out<at::native::`anonymous-namespace'::CPUKernelLauncher,at::native::`anonymous-namespace'::MulOp>(at::Tensor &,const at::Tensor &,const at::Tensor &,const bool)' being compiled
2022-09-06T11:40:11.4707509Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '_This': undeclared identifier
2022-09-06T11:40:11.4709271Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<hash_coeffs>': undeclared identifier
2022-09-06T11:40:11.4710981Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<sdim>': undeclared identifier
2022-09-06T11:40:11.4712290Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2439: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>::sdim': member could not be initialized
2022-09-06T11:40:11.4714501Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): note: see declaration of 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>::sdim'
2022-09-06T11:40:11.4716212Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2614: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>': illegal member initialization: '__this' is not a base or member
2022-09-06T11:40:11.4717816Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): fatal error C1903: unable to recover from previous error(s); stopping compilation
2022-09-06T11:40:12.5915866Z [4857/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorBackward.cpp.obj
2022-09-06T11:40:12.8396080Z [4858/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\sparse\SparseTensorMath.cpp.obj
2022-09-06T11:40:12.9638832Z [4859/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorTransformerFunctions.cpp.obj
2022-09-06T11:40:18.4447584Z [4860/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\WeightNorm.cpp.obj
2022-09-06T11:40:27.5404389Z [4861/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorUtils.cpp.obj
2022-09-06T11:40:28.5418262Z [4862/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\Copy.cpp.obj
2022-09-06T11:40:30.2605622Z [4863/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\TensorCompare.cpp.obj
2022-09-06T11:40:30.3386820Z [4864/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\AffineQuantizer.cpp.obj
2022-09-06T11:40:30.5603616Z [4865/5786] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\QTensor.cpp.obj

See GitHub Actions build trunk / win-vs2019-cuda11.6-py3 / build (4/10)

Step: "Build" (full log | diagnosis details)

2022-09-06T11:39:59.0912460Z C:\actions-runner\...r initialization: '__this' is not a base or member
2022-09-06T11:39:59.0902314Z         [
2022-09-06T11:39:59.0902775Z             kernel_t=at::native::`anonymous-namespace'::CPUKernelLauncher,
2022-09-06T11:39:59.0903326Z             binary_op_t=at::native::`anonymous-namespace'::MulOp
2022-09-06T11:39:59.0903722Z         ]
2022-09-06T11:39:59.0904793Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen\native\sparse\SparseBinaryOpIntersectionKernel.cpp(32): note: see reference to function template instantiation 'at::Tensor &at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_out<at::native::`anonymous-namespace'::CPUKernelLauncher,at::native::`anonymous-namespace'::MulOp>(at::Tensor &,const at::Tensor &,const at::Tensor &,const bool)' being compiled
2022-09-06T11:39:59.0906412Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '_This': undeclared identifier
2022-09-06T11:39:59.0907416Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<hash_coeffs>': undeclared identifier
2022-09-06T11:39:59.0908425Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<sdim>': undeclared identifier
2022-09-06T11:39:59.0909797Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2439: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim': member could not be initialized
2022-09-06T11:39:59.0911218Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): note: see declaration of 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim'
2022-09-06T11:39:59.0912460Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2614: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>': illegal member initialization: '__this' is not a base or member
2022-09-06T11:39:59.0913680Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): fatal error C1903: unable to recover from previous error(s); stopping compilation
2022-09-06T11:39:59.4193760Z [4896/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\sparse\SparseTensorMath.cpp.obj
2022-09-06T11:40:02.5024554Z [4897/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\WeightNorm.cpp.obj
2022-09-06T11:40:08.6858531Z [4898/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorUtils.cpp.obj
2022-09-06T11:40:08.8307053Z [4899/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorTransformerFunctions.cpp.obj
2022-09-06T11:40:11.2502816Z [4900/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\Copy.cpp.obj
2022-09-06T11:40:11.5131701Z [4901/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorBackward.cpp.obj
2022-09-06T11:40:12.2421104Z [4902/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorMath.cpp.obj
2022-09-06T11:40:12.2423708Z ninja: build stopped: subcommand failed.
2022-09-06T11:40:12.2525542Z -- Building version 1.13.0a0+git8f6bdea

See GitHub Actions build linux-binary-libtorch-pre-cxx11 / libtorch-cpu-shared-with-deps-cxx11-abi-test / build (5/10)

Step: "Download Build Artifacts" (full log | diagnosis details)

2022-09-06T12:06:23.3021466Z ##[error]Can't fin...actions/checkout before running your local action?
2022-09-06T12:06:23.2940458Z Emitted 'error' event on WriteStream instance at:
2022-09-06T12:06:23.2940724Z     at emitErrorNT (node:internal/streams/destroy:157:8)
2022-09-06T12:06:23.2940999Z     at emitErrorCloseNT (node:internal/streams/destroy:122:3)
2022-09-06T12:06:23.2941296Z     at processTicksAndRejections (node:internal/process/task_queues:83:21) {
2022-09-06T12:06:23.2941575Z   errno: -13,
2022-09-06T12:06:23.2941774Z   code: 'EACCES',
2022-09-06T12:06:23.2941961Z   syscall: 'open',
2022-09-06T12:06:23.2942460Z   path: '/home/ec2-user/actions-runner/_work/_temp/artifacts/debug-libtorch-cxx11-abi-shared-with-deps-1.13.0.dev20220906+cpu-e7f19104.zip'
2022-09-06T12:06:23.2942836Z }
2022-09-06T12:06:23.3007055Z ##[error]Can't find 'action.yml', 'action.yaml' or 'Dockerfile' under '/home/ec2-user/actions-runner/_work/pytorch/pytorch/pytorch/.github/actions/teardown-linux'. Did you forget to run actions/checkout before running your local action?
2022-09-06T12:06:23.3021466Z ##[error]Can't find 'action.yml', 'action.yaml' or 'Dockerfile' under '/home/ec2-user/actions-runner/_work/pytorch/pytorch/pytorch/.github/actions/chown-workspace'. Did you forget to run actions/checkout before running your local action?
2022-09-06T12:06:23.3047067Z Post job cleanup.
2022-09-06T12:06:23.3123256Z Post job cleanup.
2022-09-06T12:06:23.4172951Z Cleaning up orphan processes

See GitHub Actions build pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge) (6/10)

Step: "Test" (full log | diagnosis details)

2022-09-06T11:50:36.4994110Z ##[error]Process completed with exit code 1.
2022-09-06T11:50:36.4951935Z Non-cacheable calls                   0
2022-09-06T11:50:36.4952336Z Non-compilation calls                 0
2022-09-06T11:50:36.4952540Z Unsupported compiler calls            0
2022-09-06T11:50:36.4952759Z Average cache write               0.000 s
2022-09-06T11:50:36.4952979Z Average cache read miss           0.000 s
2022-09-06T11:50:36.4953181Z Average cache read hit            0.000 s
2022-09-06T11:50:36.4953401Z Failed distributed compilations       0
2022-09-06T11:50:36.4953952Z Cache location                  S3, bucket: Bucket(name=ossci-compiler-cache-circleci-v2, base_url=http://ossci-compiler-cache-circleci-v2.s3.amazonaws.com/)
2022-09-06T11:50:36.4954389Z + echo ::endgroup::
2022-09-06T11:50:36.4954906Z ##[endgroup]
2022-09-06T11:50:36.4994110Z ##[error]Process completed with exit code 1.
2022-09-06T11:50:36.5032619Z Prepare all required actions
2022-09-06T11:50:36.5032915Z Getting action download info
2022-09-06T11:50:36.6541249Z ##[group]Run ./.github/actions/get-workflow-job-id
2022-09-06T11:50:36.6541476Z with:
2022-09-06T11:50:36.6541800Z   github-token: ***
2022-09-06T11:50:36.6541958Z env:
2022-09-06T11:50:36.6542129Z   GIT_DEFAULT_BRANCH: master
2022-09-06T11:50:36.6542312Z ##[endgroup]
2022-09-06T11:50:36.6569133Z ##[group]Run nick-fields/retry@7d4a37704547a311dbb66ebdf5b23ec19374a767
2022-09-06T11:50:36.6569368Z with:

See GitHub Actions build pull / win-vs2019-cpu-py3 / build (7/10)

Step: "Build" (full log | diagnosis details)

2022-09-06T11:39:32.7587804Z C:\actions-runner\...r initialization: '__this' is not a base or member
2022-09-06T11:39:32.7581923Z         [
2022-09-06T11:39:32.7582196Z             kernel_t=at::native::`anonymous-namespace'::CPUKernelLauncher,
2022-09-06T11:39:32.7582514Z             binary_op_t=at::native::`anonymous-namespace'::MulOp
2022-09-06T11:39:32.7582740Z         ]
2022-09-06T11:39:32.7583458Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen\native\sparse\SparseBinaryOpIntersectionKernel.cpp(32): note: see reference to function template instantiation 'at::Tensor &at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_out<at::native::`anonymous-namespace'::CPUKernelLauncher,at::native::`anonymous-namespace'::MulOp>(at::Tensor &,const at::Tensor &,const at::Tensor &,const bool)' being compiled
2022-09-06T11:39:32.7584329Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '_This': undeclared identifier
2022-09-06T11:39:32.7584905Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<hash_coeffs>': undeclared identifier
2022-09-06T11:39:32.7585465Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<sdim>': undeclared identifier
2022-09-06T11:39:32.7586173Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2439: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim': member could not be initialized
2022-09-06T11:39:32.7586977Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): note: see declaration of 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>::sdim'
2022-09-06T11:39:32.7587804Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2614: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_c340a7496557ab75b81f516d294aa920>': illegal member initialization: '__this' is not a base or member
2022-09-06T11:39:32.7588619Z C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): fatal error C1903: unable to recover from previous error(s); stopping compilation
2022-09-06T11:39:33.0066303Z [4848/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorBackward.cpp.obj
2022-09-06T11:39:33.0695091Z [4849/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorMath.cpp.obj
2022-09-06T11:39:33.3851273Z [4850/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorTransformerFunctions.cpp.obj
2022-09-06T11:39:33.4473312Z [4851/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\sparse\SparseTensorMath.cpp.obj
2022-09-06T11:39:33.8233635Z [4852/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorUtils.cpp.obj
2022-09-06T11:39:33.8994654Z [4853/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\Copy.cpp.obj
2022-09-06T11:39:35.2229692Z [4854/5919] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\WeightNorm.cpp.obj
2022-09-06T11:39:35.2232385Z ninja: build stopped: subcommand failed.
2022-09-06T11:39:35.2329250Z -- Building version 1.13.0a0+git8f6bdea

See GitHub Actions build pull / linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu) (8/10)

Step: "Install nvidia driver, nvidia-docker runtime, set GPU_FLAG" (full log | diagnosis details)

2022-09-06T11:52:56.9918272Z ##[error]Final attempt failed. Child_process exited with error code 1
2022-09-06T11:52:46.8758064Z 
2022-09-06T11:52:46.8758628Z Please refer to the following page for additional information and to install
2022-09-06T11:52:46.8759543Z optional driver components:
2022-09-06T11:52:46.8759920Z 
2022-09-06T11:52:46.8760150Z  http://negativo17.org/nvidia-driver/
2022-09-06T11:52:46.8760364Z 
2022-09-06T11:52:46.8760398Z 
2022-09-06T11:52:46.8760543Z (Answer: Abort installation)
2022-09-06T11:52:46.8761278Z ERROR: The installation was canceled due to the availability or presence of an alternate driver installation. Please see /var/log/nvidia-installer.log for more details.
2022-09-06T11:52:46.8763492Z + false
2022-09-06T11:52:56.9918272Z ##[error]Final attempt failed. Child_process exited with error code 1
2022-09-06T11:52:56.9919066Z 
2022-09-06T11:52:56.9919716Z 
2022-09-06T11:52:57.0003630Z Prepare all required actions
2022-09-06T11:52:57.0004027Z Getting action download info
2022-09-06T11:52:57.1755650Z ##[group]Run ./.github/actions/get-workflow-job-id
2022-09-06T11:52:57.1755962Z with:
2022-09-06T11:52:57.1756392Z   github-token: ***
2022-09-06T11:52:57.1756654Z env:
2022-09-06T11:52:57.1756912Z   GIT_DEFAULT_BRANCH: master
2022-09-06T11:52:57.1757164Z ##[endgroup]

See GitHub Actions build windows-binary-wheel / wheel-py3_7-cuda11_3-build (9/10)

Step: "Build PyTorch binary" (full log | diagnosis details)

2022-09-06T11:46:29.8030919Z C:\actions-runner\...r initialization: '__this' is not a base or member
2022-09-06T11:46:29.7985532Z         [
2022-09-06T11:46:29.7986029Z             kernel_t=at::native::`anonymous-namespace'::CPUKernelLauncher,
2022-09-06T11:46:29.7986797Z             binary_op_t=at::native::`anonymous-namespace'::MulOp
2022-09-06T11:46:29.7987807Z         ]
2022-09-06T11:46:29.7998330Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\sparse\SparseBinaryOpIntersectionKernel.cpp(32): note: see reference to function template instantiation 'at::Tensor &at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_out<at::native::`anonymous-namespace'::CPUKernelLauncher,at::native::`anonymous-namespace'::MulOp>(at::Tensor &,const at::Tensor &,const at::Tensor &,const bool)' being compiled
2022-09-06T11:46:29.8003749Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '_This': undeclared identifier
2022-09-06T11:46:29.8017379Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<hash_coeffs>': undeclared identifier
2022-09-06T11:46:29.8022203Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2065: '<sdim>': undeclared identifier
2022-09-06T11:46:29.8026829Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2439: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>::sdim': member could not be initialized
2022-09-06T11:46:29.8029310Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): note: see declaration of 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>::sdim'
2022-09-06T11:46:29.8030919Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): error C2614: 'at::native::`anonymous-namespace'::_sparse_binary_op_intersection_kernel_impl::<lambda_96b9eb8ca71aff28f3d2715bd0d87e3c>': illegal member initialization: '__this' is not a base or member
2022-09-06T11:46:29.8032241Z C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen/native/sparse/SparseBinaryOpIntersectionCommon.h(221): fatal error C1903: unable to recover from previous error(s); stopping compilation
2022-09-06T11:46:30.1628493Z [4895/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\ao_sparse\quantized\cpu\qlinear_unpack.cpp.obj
2022-09-06T11:46:36.5730020Z [4896/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\WeightNorm.cpp.obj
2022-09-06T11:46:47.5137406Z [4897/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorTransformerFunctions.cpp.obj
2022-09-06T11:46:47.7844445Z [4898/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorUtils.cpp.obj
2022-09-06T11:46:50.0973583Z [4899/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\QTensor.cpp.obj
2022-09-06T11:46:50.3366031Z [4900/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\nested\NestedTensorMath.cpp.obj
2022-09-06T11:46:50.5025472Z [4901/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\AffineQuantizer.cpp.obj
2022-09-06T11:46:50.5747634Z [4902/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\TensorCompare.cpp.obj
2022-09-06T11:46:50.6552199Z [4903/6421] Building CXX object caffe2\CMakeFiles\torch_cpu.dir\__\aten\src\ATen\native\quantized\FakeQuantPerChannelAffine.cpp.obj

See GitHub Actions build pull / linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu) (10/10)

Step: "Install nvidia driver, nvidia-docker runtime, set GPU_FLAG" (full log | diagnosis details)

2022-09-06T11:52:47.2701968Z ##[error]Final attempt failed. Child_process exited with error code 1
2022-09-06T11:52:37.1010806Z 
2022-09-06T11:52:37.1011030Z Please refer to the following page for additional information and to install
2022-09-06T11:52:37.1011390Z optional driver components:
2022-09-06T11:52:37.1011568Z 
2022-09-06T11:52:37.1011812Z  http://negativo17.org/nvidia-driver/
2022-09-06T11:52:37.1012015Z 
2022-09-06T11:52:37.1012034Z 
2022-09-06T11:52:37.1012160Z (Answer: Abort installation)
2022-09-06T11:52:37.1012798Z ERROR: The installation was canceled due to the availability or presence of an alternate driver installation. Please see /var/log/nvidia-installer.log for more details.
2022-09-06T11:52:37.1016380Z + false
2022-09-06T11:52:47.2701968Z ##[error]Final attempt failed. Child_process exited with error code 1
2022-09-06T11:52:47.2702694Z 
2022-09-06T11:52:47.2703254Z 
2022-09-06T11:52:47.2778845Z Prepare all required actions
2022-09-06T11:52:47.2779223Z Getting action download info
2022-09-06T11:52:47.4467994Z ##[group]Run ./.github/actions/get-workflow-job-id
2022-09-06T11:52:47.4468298Z with:
2022-09-06T11:52:47.4468699Z   github-token: ***
2022-09-06T11:52:47.4468915Z env:
2022-09-06T11:52:47.4469151Z   GIT_DEFAULT_BRANCH: master
2022-09-06T11:52:47.4469407Z ##[endgroup]

🕵️‍♀️ 3 failures not recognized by patterns:

The following CI failures may be due to changes from the PR
Job Step
GitHub Actions trunk / linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu) Unknown
GitHub Actions libtorch-cpu-shared-with-deps-debug-build Unknown
CircleCI Checks build Unknown

This comment was automatically generated by Dr. CI (expand for details).

Please report bugs/suggestions to the (internal) Dr. CI Users group.

Click here to manually regenerate this comment.

@nikitaved
Copy link
Collaborator Author

This one is not ready yet. I want to see the CI's outputs and I will post benchmarks later.

nikitaved added a commit that referenced this pull request Aug 16, 2022
@nikitaved nikitaved added the ciflow/trunk Trigger trunk jobs on your pull request label Aug 16, 2022
nikitaved added a commit that referenced this pull request Aug 16, 2022
@pytorchmergebot
Copy link
Collaborator

@pytorchbot successfully started a rebase job. Check the current status here

Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Seed PyTorch's RNG so the randomly generated inputs are reproducible.
torch.manual_seed(13)

# Each entry is an (n, nnz) pair: the benchmarked tensors have shape (n, n)
# and are built from nnz randomly drawn index/value pairs.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` tags this run in the report and in the output file name.
name = "PR"
device = "cuda"
results = []


def gen_tensor(n, nnz, coalesce=False):
    """Build a random sparse COO tensor of shape (n, n) on `device`.

    nnz (row, col) index pairs and nnz values are drawn at random, and then
    both are concatenated with themselves, so every entry occurs at least
    twice before coalescing.  The tensor is coalesced only when `coalesce`
    is true.
    """
    size = (n, n)
    rows = torch.randint(low=0, high=size[0], size=(nnz,), device=device)
    cols = torch.randint(low=0, high=size[1], size=(nnz,), device=device)
    indices = torch.vstack((rows, cols))
    values = torch.randn(nnz, device=device)
    # Duplicate every (index, value) pair.
    indices = torch.hstack((indices, indices))
    values = torch.hstack((values, values))
    tensor = torch.sparse_coo_tensor(indices, values, size=size)
    return tensor.coalesce() if coalesce else tensor


for n, nnz in problem_dims:
    # Cover all four coalesced/uncoalesced combinations of the two operands.
    for x_coalesce, y_coalesce in product((True, False), repeat=2):
        # The timed statement refers to the operands by these global names.
        x = gen_tensor(n, nnz, x_coalesce)
        y = gen_tensor(n, nnz, y_coalesce)
        measurement = Timer(
            "x * y",
            globals=globals(),
            label="coo.mul",
            description=f"{name}: mul, device: {device}",
            sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
            num_threads=torch.get_num_threads(),
        ).blocked_autorange()
        results.append(measurement)

# Print a summary table of this run.
report = Compare(results)
report.trim_significant_figures()
report.print()

# Save the measurements so that they can be merged with another run later.
with open(f"{name}_{device}_mul.pickle", 'wb') as out_file:
    pickle.dump(results, out_file)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Tags of the benchmark runs whose pickled measurements are merged below.
files = ["PR", "master"]

device = 'cuda'

timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, 'rb') as in_file:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by the benchmark script itself.
        timers.extend(pickle.load(in_file))

# Print all collected measurements in one comparison table.
report = Compare(timers)
report.trim_significant_figures()
report.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8         
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34         
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34         
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56         
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24         
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100         
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100         
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255         
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200         
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023         
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036         
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989         
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7         
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30         
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30         
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50         
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7         
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100         
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100         
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210         
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7         
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500         
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510         
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457         
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25         
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130         
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130         
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255         
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100         
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290         
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357         
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783         
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211         
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480         
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539         
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801         
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374         
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620         
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080         
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650         
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000         
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200         
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600         
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900         

Times are in microseconds (us).
```

</details>


[ghstack-poisoned]
@pytorchmergebot
Copy link
Collaborator

Successfully rebased gh/nikitaved/2/orig onto refs/remotes/origin/master, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/83428)

pytorchmergebot pushed a commit that referenced this pull request Sep 15, 2022
@cpuhrsch
Copy link
Contributor

@pytorchbot merge -g

@pytorchmergebot
Copy link
Collaborator

@pytorchbot successfully started a merge job. Check the current status here.
The merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.
Please reach out to the PyTorch DevX Team with feedback or questions!

@github-actions
Copy link
Contributor

Hey @nikitaved.
You've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.
For changes that are 'topic: not user facing' there is no need for a release notes label.

@cpuhrsch cpuhrsch added the topic: new features topic category label Sep 16, 2022
@osalpekar
Copy link
Member

@pytorchbot revert -m "Reverted because __restrict symbol not supported by certain MSVC compilers, leading to undefined symbol error at compilation time" -c=ghfirst

@pytorchmergebot
Copy link
Collaborator

@pytorchbot successfully started a revert job. Check the current status here.
Please reach out to the PyTorch DevX Team with feedback or questions!

@pytorchmergebot
Copy link
Collaborator

@nikitaved your PR has been successfully reverted.

pytorchmergebot added a commit that referenced this pull request Sep 17, 2022
…83428)"

This reverts commit d49943b.

Reverted #83428 on behalf of https://github.com/osalpekar due to Reverted because __restrict symbol not supported by certain MSVC compilers, leading to undefined symbol error at compilation time
@nikitaved
Copy link
Collaborator Author

Not the first time I use it just the same. I wonder why it never backfired before...

@facebook-github-bot facebook-github-bot deleted the gh/nikitaved/2/head branch September 19, 2022 14:19
@nikitaved nikitaved restored the gh/nikitaved/2/head branch September 20, 2022 10:36
nikitaved added a commit that referenced this pull request Sep 20, 2022
Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Seed PyTorch's RNG so the randomly generated inputs are reproducible.
torch.manual_seed(13)

# Each entry is an (n, nnz) pair: the benchmarked tensors have shape (n, n)
# and are built from nnz randomly drawn index/value pairs.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` tags this run in the report and in the output file name.
name = "PR"
device = "cuda"
results = []


def gen_tensor(n, nnz, coalesce=False):
    """Build a random sparse COO tensor of shape (n, n) on `device`.

    nnz (row, col) index pairs and nnz values are drawn at random, and then
    both are concatenated with themselves, so every entry occurs at least
    twice before coalescing.  The tensor is coalesced only when `coalesce`
    is true.
    """
    size = (n, n)
    rows = torch.randint(low=0, high=size[0], size=(nnz,), device=device)
    cols = torch.randint(low=0, high=size[1], size=(nnz,), device=device)
    indices = torch.vstack((rows, cols))
    values = torch.randn(nnz, device=device)
    # Duplicate every (index, value) pair.
    indices = torch.hstack((indices, indices))
    values = torch.hstack((values, values))
    tensor = torch.sparse_coo_tensor(indices, values, size=size)
    return tensor.coalesce() if coalesce else tensor


for n, nnz in problem_dims:
    # Cover all four coalesced/uncoalesced combinations of the two operands.
    for x_coalesce, y_coalesce in product((True, False), repeat=2):
        # The timed statement refers to the operands by these global names.
        x = gen_tensor(n, nnz, x_coalesce)
        y = gen_tensor(n, nnz, y_coalesce)
        measurement = Timer(
            "x * y",
            globals=globals(),
            label="coo.mul",
            description=f"{name}: mul, device: {device}",
            sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
            num_threads=torch.get_num_threads(),
        ).blocked_autorange()
        results.append(measurement)

# Print a summary table of this run.
report = Compare(results)
report.trim_significant_figures()
report.print()

# Save the measurements so that they can be merged with another run later.
with open(f"{name}_{device}_mul.pickle", 'wb') as out_file:
    pickle.dump(results, out_file)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Labels of the benchmark runs to merge; each label selects one pickle file
# named "<label>_<device>_mul.pickle".
files = ["PR", "master"]

device = 'cuda'

# Concatenate the measurements stored in every pickle into one flat list.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, 'rb') as f:
        timers.extend(pickle.load(f))

# Print all collected measurements as a single comparison table.
compare = Compare(timers)
compare.trim_significant_figures()
compare.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900

Times are in microseconds (us).
```

</details>

Pull Request resolved: #83428
Approved by: https://github.com/cpuhrsch
@nikitaved nikitaved deleted the gh/nikitaved/2/head branch September 20, 2022 13:18
pytorchmergebot pushed a commit that referenced this pull request Sep 23, 2022
This is a combo PR of #84929 and ~#83428.

Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Fix the RNG seed so that repeated runs generate the same sparse inputs.
torch.manual_seed(13)

# Each entry is (n, nnz): the side length of the square (n, n) matrix and the
# number of random coordinates drawn for it.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` tags this run; it appears in the Timer description and in the name of
# the pickle file written at the end of the script.
name = "PR"
device = "cuda"
results = []

for n, nnz in problem_dims:
    def gen_tensor(coalesce=False):
        """Build a random (n, n) sparse COO tensor on `device`.

        `n` and `nnz` are read from the current iteration of the enclosing
        loop. `nnz` random (row, col) coordinates and `nnz` random values are
        drawn, and both are then repeated once, so every coordinate occurs at
        least twice and the tensor holds 2 * nnz entries before coalescing.

        Returns the coalesced tensor when `coalesce` is true, otherwise the
        tensor exactly as constructed.
        """
        dims = (n, n)
        rows = torch.randint(low=0, high=dims[0], size=(nnz,), device=device)
        cols = torch.randint(low=0, high=dims[1], size=(nnz,), device=device)
        indices = torch.vstack((rows, cols))
        values = torch.randn(nnz, device=device)
        # Append a copy of the indices and values to themselves to force
        # duplicate coordinates.
        indices = torch.hstack((indices, indices))
        values = torch.hstack((values, values))
        tensor = torch.sparse_coo_tensor(indices, values, size=dims)
        return tensor.coalesce() if coalesce else tensor

    # Cover all four coalesced/uncoalesced combinations of the two operands.
    for x_coalesce, y_coalesce in product((True, False), repeat=2):
        x = gen_tensor(x_coalesce)
        y = gen_tensor(y_coalesce)
        stmt = "x * y"
        # The timed statement refers to the module-level `x` and `y`, which
        # are handed to the Timer through `globals=globals()`.
        timer = Timer(
            stmt,
            globals=globals(),
            label="coo.mul",
            description=f"{name}: mul, device: {device}",
            sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
            num_threads=torch.get_num_threads(),
        )
        measurement = timer.blocked_autorange()
        results.append(measurement)

# Print a comparison table of everything measured in this run.
compare = Compare(results)
compare.trim_significant_figures()
compare.print()

# Save the raw measurements so that they can be merged with other runs later.
with open(f"{name}_{device}_mul.pickle", 'wb') as f:
    pickle.dump(results, f)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Labels of the benchmark runs to merge; each label selects one pickle file
# named "<label>_<device>_mul.pickle".
files = ["PR", "master"]

device = 'cuda'

# Concatenate the measurements stored in every pickle into one flat list.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, 'rb') as f:
        timers.extend(pickle.load(f))

# Print all collected measurements as a single comparison table.
compare = Compare(timers)
compare.trim_significant_figures()
compare.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900

Times are in microseconds (us).
```

</details>

Pull Request resolved: #85336
Approved by: https://github.com/cpuhrsch
pytorchmergebot pushed a commit that referenced this pull request Sep 23, 2022
This is a combo PR of #84929 and ~#83428.

Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Fix the RNG seed so that repeated runs generate the same sparse inputs.
torch.manual_seed(13)

# Each entry is (n, nnz): the side length of the square (n, n) matrix and the
# number of random coordinates drawn for it.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` tags this run; it appears in the Timer description and in the name of
# the pickle file written at the end of the script.
name = "PR"
device = "cuda"
results = []

for n, nnz in problem_dims:
    def gen_tensor(coalesce=False):
        """Build a random (n, n) sparse COO tensor on `device`.

        `n` and `nnz` are read from the current iteration of the enclosing
        loop. `nnz` random (row, col) coordinates and `nnz` random values are
        drawn, and both are then repeated once, so every coordinate occurs at
        least twice and the tensor holds 2 * nnz entries before coalescing.

        Returns the coalesced tensor when `coalesce` is true, otherwise the
        tensor exactly as constructed.
        """
        dims = (n, n)
        rows = torch.randint(low=0, high=dims[0], size=(nnz,), device=device)
        cols = torch.randint(low=0, high=dims[1], size=(nnz,), device=device)
        indices = torch.vstack((rows, cols))
        values = torch.randn(nnz, device=device)
        # Append a copy of the indices and values to themselves to force
        # duplicate coordinates.
        indices = torch.hstack((indices, indices))
        values = torch.hstack((values, values))
        tensor = torch.sparse_coo_tensor(indices, values, size=dims)
        return tensor.coalesce() if coalesce else tensor

    # Cover all four coalesced/uncoalesced combinations of the two operands.
    for x_coalesce, y_coalesce in product((True, False), repeat=2):
        x = gen_tensor(x_coalesce)
        y = gen_tensor(y_coalesce)
        stmt = "x * y"
        # The timed statement refers to the module-level `x` and `y`, which
        # are handed to the Timer through `globals=globals()`.
        timer = Timer(
            stmt,
            globals=globals(),
            label="coo.mul",
            description=f"{name}: mul, device: {device}",
            sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
            num_threads=torch.get_num_threads(),
        )
        measurement = timer.blocked_autorange()
        results.append(measurement)

# Print a comparison table of everything measured in this run.
compare = Compare(results)
compare.trim_significant_figures()
compare.print()

# Save the raw measurements so that they can be merged with other runs later.
with open(f"{name}_{device}_mul.pickle", 'wb') as f:
    pickle.dump(results, f)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Labels of the benchmark runs to merge; each label selects one pickle file
# named "<label>_<device>_mul.pickle".
files = ["PR", "master"]

device = 'cuda'

# Concatenate the measurements stored in every pickle into one flat list.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, 'rb') as f:
        timers.extend(pickle.load(f))

# Print all collected measurements as a single comparison table.
compare = Compare(timers)
compare.trim_significant_figures()
compare.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900

Times are in microseconds (us).
```

</details>

Pull Request resolved: #85336
Approved by: https://github.com/cpuhrsch
mehtanirav pushed a commit that referenced this pull request Oct 4, 2022
Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Benchmark of elementwise mul between two sparse COO tensors. Seed torch's
# RNG first so the randomly generated inputs are reproducible between runs.
torch.manual_seed(13)

# specifies (n, nnz): tensors have shape (n, n) and are built from nnz
# randomly drawn (row, col) index pairs.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` labels this run: it appears in each measurement's description and in
# the name of the pickle file written at the end of the script.
name = "PR"
device = "cuda"
# One measurement per (problem size, coalesce combination).
results = []

for n, nnz in problem_dims:
    def gen_tensor(coalesce=False):
        """Build a random (n, n) sparse COO tensor on ``device``.

        ``n`` and ``nnz`` are read from the enclosing loop. Every generated
        index/value pair is stored twice, so the uncoalesced tensor always
        contains duplicate entries.

        Args:
            coalesce: if true, return ``res.coalesce()`` instead of the raw
                tensor.

        Returns:
            The sparse COO tensor, coalesced or not as requested.
        """
        shape = (n, n)
        nrows, ncols = shape
        rowidx = torch.randint(low=0, high=nrows, size=(nnz,), device=device)
        colidx = torch.randint(low=0, high=ncols, size=(nnz,), device=device)
        # Stack into a (2, nnz) index matrix: row indices on top, columns below.
        itemidx = torch.vstack((rowidx, colidx))
        xvalues = torch.randn(nnz, device=device)
        # Append a copy of every index column and value, giving 2 * nnz stored
        # entries in which each (row, col) position occurs at least twice.
        itemidx = torch.hstack((itemidx, itemidx))
        xvalues = torch.hstack((xvalues, xvalues))
        res = torch.sparse_coo_tensor(itemidx, xvalues, size=shape)
        if coalesce:
            return res.coalesce()
        else:
            return res

    # Time all four combinations of coalesced / uncoalesced operands.
    for x_coalesce, y_coalesce in product(*repeat((True, False), 2)):
        x = gen_tensor(x_coalesce)
        y = gen_tensor(y_coalesce)
        # Statement to be timed; `x` and `y` are looked up through the
        # globals() mapping handed to Timer below.
        smtp = "x * y"
        # The sub_label formats a tuple inside literal parentheses, so it
        # renders as e.g. "coalesce=((True, False))", which is the form seen
        # in the result tables.
        timer = Timer(smtp,
                      globals=globals(),
                      label="coo.mul",
                      description=f"{name}: mul, device: {device}",
                      sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
                      num_threads=torch.get_num_threads())
        results.append(timer.blocked_autorange())

# Print the measurements of this run as a table.
compare = Compare(results)
compare.trim_significant_figures()
compare.print()

# Save the raw measurements; the "Gather results" script opens files with this
# naming pattern to compare runs.
with open(f"{name}_{device}_mul.pickle", 'wb') as f:
    pickle.dump(results, f)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Labels of the benchmark runs to combine; each label is the prefix of one
# "<label>_<device>_mul.pickle" file holding that run's measurements.
files = ["PR", "master"]
device = "cuda"

# Load the pickled measurements of every run into one flat list.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, "rb") as f:
        timers.extend(pickle.load(f))

# Print all loaded measurements together as a single comparison table.
compare = Compare(timers)
compare.trim_significant_figures()
compare.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900

Times are in microseconds (us).
```

</details>

Pull Request resolved: #83428
Approved by: https://github.com/cpuhrsch
mehtanirav pushed a commit that referenced this pull request Oct 4, 2022
…83428)"

This reverts commit d49943b.

Reverted #83428 on behalf of https://github.com/osalpekar because the `__restrict` symbol is not supported by certain MSVC compilers, leading to an undefined-symbol error at compilation time.
mehtanirav pushed a commit that referenced this pull request Oct 4, 2022
This is a combo PR of #84929 and ~#83428.

Preliminary benchmarks (square matrices of shape (n, n)).

<details>

<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

# Benchmark of elementwise mul between two sparse COO tensors. Seed torch's
# RNG first so the randomly generated inputs are reproducible between runs.
torch.manual_seed(13)

# Each entry is (n, nnz): tensors have shape (n, n) and are built from nnz
# randomly drawn (row, col) index pairs.
problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

# `name` labels this run: it appears in each measurement's description and in
# the name of the pickle file written at the end of the script.
name = "PR"
device = "cuda"
# One measurement per (problem size, coalesce combination).
results = []

for n, nnz in problem_dims:
    def gen_tensor(coalesce=False):
        """Build a random (n, n) sparse COO tensor on ``device``.

        ``n`` and ``nnz`` are read from the enclosing loop. Every generated
        index/value pair is stored twice, so the uncoalesced tensor always
        contains duplicate entries.

        Args:
            coalesce: if true, return ``res.coalesce()`` instead of the raw
                tensor.

        Returns:
            The sparse COO tensor, coalesced or not as requested.
        """
        shape = (n, n)
        nrows, ncols = shape
        rowidx = torch.randint(low=0, high=nrows, size=(nnz,), device=device)
        colidx = torch.randint(low=0, high=ncols, size=(nnz,), device=device)
        # Stack into a (2, nnz) index matrix: row indices on top, columns below.
        itemidx = torch.vstack((rowidx, colidx))
        xvalues = torch.randn(nnz, device=device)
        # Append a copy of every index column and value, giving 2 * nnz stored
        # entries in which each (row, col) position occurs at least twice.
        itemidx = torch.hstack((itemidx, itemidx))
        xvalues = torch.hstack((xvalues, xvalues))
        res = torch.sparse_coo_tensor(itemidx, xvalues, size=shape)
        if coalesce:
            return res.coalesce()
        else:
            return res

    # Time all four combinations of coalesced / uncoalesced operands.
    for x_coalesce, y_coalesce in product(*repeat((True, False), 2)):
        x = gen_tensor(x_coalesce)
        y = gen_tensor(y_coalesce)
        # Statement to be timed; `x` and `y` are looked up through the
        # globals() mapping handed to Timer below.
        smtp = "x * y"
        # The sub_label formats a tuple inside literal parentheses, so it
        # renders as e.g. "coalesce=((True, False))", which is the form seen
        # in the result tables.
        timer = Timer(smtp,
                      globals=globals(),
                      label="coo.mul",
                      description=f"{name}: mul, device: {device}",
                      sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
                      num_threads=torch.get_num_threads())
        results.append(timer.blocked_autorange())

# Print the measurements of this run as a table.
compare = Compare(results)
compare.trim_significant_figures()
compare.print()

# Save the raw measurements; the "Gather results" script opens files with this
# naming pattern to compare runs.
with open(f"{name}_{device}_mul.pickle", 'wb') as f:
    pickle.dump(results, f)

```

</details>

<details>

<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

# Labels of the benchmark runs to combine; each label is the prefix of one
# "<label>_<device>_mul.pickle" file holding that run's measurements.
files = ["PR", "master"]
device = "cuda"

# Load the pickled measurements of every run into one flat list.
timers = []
for name in files:
    pickle_path = "{}_{}_mul.pickle".format(name, device)
    with open(pickle_path, "rb") as f:
        timers.extend(pickle.load(f))

# Print all loaded measurements together as a single comparison table.
compare = Compare(timers)
compare.trim_significant_figures()
compare.print()

```

</details>

<details>

<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                       |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: -------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |             95          |                91
      n=10000, nnz=100, coalesce=((True, False))       |             87          |               242
      n=10000, nnz=100, coalesce=((False, True))       |             87          |               226
      n=10000, nnz=100, coalesce=((False, False))      |            130          |               371
      n=100000, nnz=1000, coalesce=((True, True))      |            100          |               521
      n=100000, nnz=1000, coalesce=((True, False))     |             90          |               649
      n=100000, nnz=1000, coalesce=((False, True))     |            100          |               659
      n=100000, nnz=1000, coalesce=((False, False))    |            200          |               781
      n=1000000, nnz=10000, coalesce=((True, True))    |            100          |              4861
      n=1000000, nnz=10000, coalesce=((True, False))   |            100          |              5012
      n=1000000, nnz=10000, coalesce=((False, True))   |             98          |              5010
      n=1000000, nnz=10000, coalesce=((False, False))  |            384          |              5174
      n=10, nnz=100, coalesce=((True, True))           |            100          |                79
      n=10, nnz=100, coalesce=((True, False))          |            100          |               221
      n=10, nnz=100, coalesce=((False, True))          |            100          |               221
      n=10, nnz=100, coalesce=((False, False))         |            100          |               350
      n=10, nnz=1000, coalesce=((True, True))          |            100          |               100
      n=10, nnz=1000, coalesce=((True, False))         |            100          |               240
      n=10, nnz=1000, coalesce=((False, True))         |            100          |               254
      n=10, nnz=1000, coalesce=((False, False))        |            100          |               392
      n=10, nnz=10000, coalesce=((True, True))         |            100          |               110
      n=10, nnz=10000, coalesce=((True, False))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, True))        |            110          |               286
      n=10, nnz=10000, coalesce=((False, False))       |            271          |               455
      n=100, nnz=1000, coalesce=((True, True))         |            110          |               851
      n=100, nnz=1000, coalesce=((True, False))        |            110          |              1000
      n=100, nnz=1000, coalesce=((False, True))        |            110          |               990
      n=100, nnz=1000, coalesce=((False, False))       |            140          |              1124
      n=100, nnz=10000, coalesce=((True, True))        |            110          |              5137
      n=100, nnz=10000, coalesce=((True, False))       |            110          |              5391
      n=100, nnz=10000, coalesce=((False, True))       |            100          |              5405
      n=100, nnz=10000, coalesce=((False, False))      |            249          |              5539
      n=1000, nnz=10000, coalesce=((True, True))       |            100          |              8598
      n=1000, nnz=10000, coalesce=((True, False))      |            100          |              8800
      n=1000, nnz=10000, coalesce=((False, True))      |            100          |              8782
      n=1000, nnz=10000, coalesce=((False, False))     |            255          |              8956
      n=1000, nnz=100000, coalesce=((True, True))      |            120          |             84500
      n=1000, nnz=100000, coalesce=((True, False))     |            200          |             88560
      n=1000, nnz=100000, coalesce=((False, True))     |            160          |             89000
      n=1000, nnz=100000, coalesce=((False, False))    |            373          |             89000
      n=1000, nnz=1000000, coalesce=((True, True))     |            312          |            606400
      n=1000, nnz=1000000, coalesce=((True, False))    |           1340          |            609200
      n=1000, nnz=1000000, coalesce=((False, True))    |           1340          |            609100
      n=1000, nnz=1000000, coalesce=((False, False))   |           4408          |            611400

Times are in microseconds (us).
```

</details>

<details>

<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                       |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: -----------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))        |              8         |                8
      n=10000, nnz=100, coalesce=((True, False))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, True))       |             32         |               34
      n=10000, nnz=100, coalesce=((False, False))      |             41         |               56
      n=100000, nnz=1000, coalesce=((True, True))      |             24         |               24
      n=100000, nnz=1000, coalesce=((True, False))     |             90         |              100
      n=100000, nnz=1000, coalesce=((False, True))     |             87         |              100
      n=100000, nnz=1000, coalesce=((False, False))    |            231         |              255
      n=1000000, nnz=10000, coalesce=((True, True))    |            190         |              200
      n=1000000, nnz=10000, coalesce=((True, False))   |            908         |             2023
      n=1000000, nnz=10000, coalesce=((False, True))   |            800         |             2036
      n=1000000, nnz=10000, coalesce=((False, False))  |           3684         |             3989
      n=10, nnz=100, coalesce=((True, True))           |              8         |                7
      n=10, nnz=100, coalesce=((True, False))          |             34         |               30
      n=10, nnz=100, coalesce=((False, True))          |             33         |               30
      n=10, nnz=100, coalesce=((False, False))         |             44         |               50
      n=10, nnz=1000, coalesce=((True, True))          |              8         |                7
      n=10, nnz=1000, coalesce=((True, False))         |            100         |              100
      n=10, nnz=1000, coalesce=((False, True))         |            130         |              100
      n=10, nnz=1000, coalesce=((False, False))        |            746         |              210
      n=10, nnz=10000, coalesce=((True, True))         |              8         |                7
      n=10, nnz=10000, coalesce=((True, False))        |           1000         |             1500
      n=10, nnz=10000, coalesce=((False, True))        |           1000         |             1510
      n=10, nnz=10000, coalesce=((False, False))       |           3063         |             2457
      n=100, nnz=1000, coalesce=((True, True))         |             25         |               25
      n=100, nnz=1000, coalesce=((True, False))        |            180         |              130
      n=100, nnz=1000, coalesce=((False, True))        |            200         |              130
      n=100, nnz=1000, coalesce=((False, False))       |            271         |              255
      n=100, nnz=10000, coalesce=((True, True))        |            100         |              100
      n=100, nnz=10000, coalesce=((True, False))       |           2444         |             2290
      n=100, nnz=10000, coalesce=((False, True))       |           2455         |             2357
      n=100, nnz=10000, coalesce=((False, False))      |           5316         |             3783
      n=1000, nnz=10000, coalesce=((True, True))       |            204         |              211
      n=1000, nnz=10000, coalesce=((True, False))      |           2457         |             2480
      n=1000, nnz=10000, coalesce=((False, True))      |           2448         |             2539
      n=1000, nnz=10000, coalesce=((False, False))     |           3665         |             4801
      n=1000, nnz=100000, coalesce=((True, True))      |           2293         |             2374
      n=1000, nnz=100000, coalesce=((True, False))     |           9000         |            24620
      n=1000, nnz=100000, coalesce=((False, True))     |           8000         |            25080
      n=1000, nnz=100000, coalesce=((False, False))    |          26500         |            47650
      n=1000, nnz=1000000, coalesce=((True, True))     |          10000         |            13000
      n=1000, nnz=1000000, coalesce=((True, False))    |          80000         |           362200
      n=1000, nnz=1000000, coalesce=((False, True))    |          78050         |           392600
      n=1000, nnz=1000000, coalesce=((False, False))   |         312100         |           766900

Times are in microseconds (us).
```

</details>

Pull Request resolved: #85336
Approved by: https://github.com/cpuhrsch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
ciflow/trunk Trigger trunk jobs on your pull request cla signed Merged open source release notes: sparse release notes category Reverted topic: new features topic category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants