In [5]:
import torch
x = torch.randn(2, requires_grad=True)
r = torch._C._EnablePythonDispatcher()
torch.add(x, x)

tensor([-0.4220,  0.5315], grad_fn=<AddBackward0>)

In [12]:
from torch._python_dispatcher import PythonDispatcher
dispatcher = PythonDispatcher()
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])

RuntimeError: Tried to register an operator (__test__::foo(Tensor x) -> Tensor) with the same name and overload name multiple times. Each overload's schema should only be registered with a single call to def(). Duplicate registration: registered at /dev/null:0. Original registration: registered at /dev/null:0

In [14]:
torch._C._DisablePythonDispatcher()
print(dispatcher.dispatchTable())


Computed Dispatch Table
key             kernel
---------------------------
CPU             fn_CPU [kernel]
XLA             fn_XLA [kernel]
Lazy            fn_CompositeImplicitAutograd [math kernel]
FPGA            fn_CompositeImplicitAutograd [math kernel]
AutogradOther   fn_CompositeImplicitAutograd [math kernel]
AutogradCPU     [backend fallback]
AutogradXLA     [backend fallback]
AutogradLazy    fn_CompositeImplicitAutograd [math kernel]



In [15]:
torch._C._EnablePythonDispatcher()
print(dispatcher.dispatchTable())


Computed Dispatch Table
key             kernel
---------------------------
CPU             fn_CPU [kernel]
XLA             fn_XLA [kernel]
Lazy            fn_CompositeImplicitAutograd [math kernel]
FPGA            fn_CompositeImplicitAutograd [math kernel]
AutogradOther   fn_CompositeImplicitAutograd [math kernel]
AutogradCPU     [backend fallback]
AutogradXLA     [backend fallback]
AutogradLazy    fn_CompositeImplicitAutograd [math kernel]



In [8]:
print(dispatcher.keys())

['CPU', 'AutogradCPU', 'FPGA', 'AutogradOther', 'XLA', 'AutogradXLA', 'Lazy', 'AutogradLazy', 'CompositeExplicitAutograd', 'Autograd', 'CompositeImplicitAutograd']


In [9]:
print(dispatcher.registrations())


Registered Kernels
key             kernel
---------------------------
CPU             fn_CPU
XLA             fn_XLA
CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd



In [10]:
print(dispatcher.rawRegistrations())

name: __test__::foo
schema: __test__::foo(Tensor x) -> Tensor
debug: registered at /dev/null:0
alias analysis kind: FROM_SCHEMA
CPU: fn_CPU :: (Tensor _0) -> Tensor _0 [ boxed unboxed ]
XLA: fn_XLA :: (Tensor _0) -> Tensor _0 [ boxed unboxed ]
CompositeImplicitAutograd[alias]: fn_CompositeImplicitAutograd :: (Tensor _0) -> Tensor _0 [ boxed unboxed ]



In [11]:
print(dispatcher.rawDispatchTable())

Undefined: fn_CompositeImplicitAutograd [math kernel]
CPU: fn_CPU [kernel]
CUDA: fn_CompositeImplicitAutograd [math kernel]
HIP: fn_CompositeImplicitAutograd [math kernel]
XLA: fn_XLA [kernel]
MPS: fn_CompositeImplicitAutograd [math kernel]
IPU: fn_CompositeImplicitAutograd [math kernel]
XPU: fn_CompositeImplicitAutograd [math kernel]
HPU: fn_CompositeImplicitAutograd [math kernel]
VE: fn_CompositeImplicitAutograd [math kernel]
Lazy: fn_CompositeImplicitAutograd [math kernel]
MTIA: fn_CompositeImplicitAutograd [math kernel]
PrivateUse1: fn_CompositeImplicitAutograd [math kernel]
PrivateUse2: fn_CompositeImplicitAutograd [math kernel]
PrivateUse3: fn_CompositeImplicitAutograd [math kernel]
Meta: fn_CompositeImplicitAutograd [math kernel]
FPGA: fn_CompositeImplicitAutograd [math kernel]
ORT: fn_CompositeImplicitAutograd [math kernel]
Vulkan: fn_CompositeImplicitAutograd [math kernel]
Metal: fn_CompositeImplicitAutograd [math kernel]
QuantizedCPU: fn_CompositeImplicitAutograd [math kernel

In [1]:
import torch
from torch import Tensor
from torch._custom_op.impl import custom_op
import numpy as np
from torch._dispatch.python import enable_crossref_functionalize
torch._C._DisablePythonDispatcher()
@custom_op("my_library::numpy_sin")
def numpy_sin(x: Tensor) -> Tensor:
    ...
# numpy_sin is now an instance of class CustomOp
# print(type(numpy_sin))
# Step 2: Register an implementation for various PyTorch subsystems
# Register an implementation for CPU tensors
@numpy_sin.impl('cpu')
def numpy_sin_impl_cpu(x):
    return torch.from_numpy(np.sin(x.numpy()))
# Register an implementation for CUDA tensors
@numpy_sin.impl('cuda')
def numpy_sin_impl_cuda(x):
    return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device)
x = torch.randn(3)
with enable_crossref_functionalize():
    y = numpy_sin(x)  # calls numpy_sin_impl_cpu
    print(y)
    x_cuda = x.cuda()
    
    y = numpy_sin(x)  # calls numpy_sin_impl_cuda
    print(y)

tensor([0.5863, 0.9270, 0.9825])
tensor([0.5863, 0.9270, 0.9825])


In [9]:
import torch
from torch.fx.experimental.proxy_tensor import make_fx
def test_pre_dispatch_mode_stack():
    def f(a):
        b = torch.ones(4, 4)
        b = numpy_sin(b)
        return torch.matmul(a, b)
    # We expect to see matmul in the trace - it should NOT be decomposed into mm.
    # Also, torch.ones() doesn't show up in the trace.
    # This is annoying but expected: ones() never dispatches to the Autograd dispatch key,
    # so our mode never sees it - it goes directly to the BackendSelect key.
    inp = torch.ones(4, 4)
    # Test that make_fx(pre_dispatch=True) clears caches properly.
    from torch._dispatch.python import enable_python_dispatcher, no_python_dispatcher
    with enable_python_dispatcher():
        out1 = f(inp)
    fx_g = make_fx(f, pre_dispatch=True)(inp)

    print(fx_g.code.strip())

test_pre_dispatch_mode_stack()

def forward(self, a_1):
    ones = torch.ops.aten.ones.default([4, 4], device = device(type='cpu'), pin_memory = False)
    numpy_sin = torch.ops.my_library.numpy_sin.default(ones);  ones = None
    matmul = torch.ops.aten.matmul.default(a_1, numpy_sin);  a_1 = numpy_sin = None
    return matmul


In [12]:
from torch._python_dispatcher import PythonDispatcher
dispatcher = PythonDispatcher()

RuntimeError: Tried to register an operator (__test__::foo(Tensor x) -> Tensor) with the same name and overload name multiple times. Each overload's schema should only be registered with a single call to def(). Duplicate registration: registered at /dev/null:0. Original registration: registered at /dev/null:0

In [13]:
dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"])
print(dispatcher.dispatchTable())


Computed Dispatch Table
key             kernel
---------------------------
CPU             fn_CPU [kernel]
XLA             fn_XLA [kernel]
Lazy            fn_CompositeImplicitAutograd [math kernel]
FPGA            fn_CompositeImplicitAutograd [math kernel]
AutogradOther   fn_CompositeImplicitAutograd [math kernel]
AutogradCPU     [backend fallback]
AutogradXLA     [backend fallback]
AutogradLazy    fn_CompositeImplicitAutograd [math kernel]



In [14]:
from torch.utils._python_dispatch import _get_current_dispatch_mode_stack

In [15]:
_get_current_dispatch_mode_stack()

[]