In [None]:
import platform
import psutil
from typing import Tuple, Union
from timeit import timeit
from warnings import warn

# PyTorch dependencies
import torch
import torch.backends.opt_einsum as opt_einsum
from torch import Tensor

# Internal dependencies
from thoad import backward, Controller

In [None]:
# control size of tensors
TENSOR_SCALE: Union[int, float] = 1
REPEAT_SCALE: Union[int, float] = 1

In [3]:
sys: platform.uname_result = platform.uname()
print(f"system           {sys.system} {sys.release} {sys.version}")

system           Windows 11 10.0.26100


In [4]:
dev: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if dev.type == 'cuda':
    idx = dev.index if dev.index is not None else 0
    props: "_CudaDeviceProperties" = torch.cuda.get_device_properties(idx)
    name: str = props.name
    total_mem_gb: float = props.total_memory / (1024**3)
    print(f"using device     {dev} -> {name}")
    print(f"device memory    {total_mem_gb:.1f} GB)")
else:
    cpu_name: str = platform.processor() or "CPU"
    print(f"using device     {dev} -> {cpu_name}")
    print(f"physical cores   {psutil.cpu_count(logical=False)}")
    print(f"logical cores    {psutil.cpu_count(logical=True)}")

using device     cpu -> AMD64 Family 23 Model 160 Stepping 0, AuthenticAMD
physical cores   4
logical cores    8


In [None]:
if opt_einsum.is_available():
    opt_einsum.enabled = True
    opt_einsum.strategy = "greedy"
    print("opt_einsum backend enabled")
else:
    warn(
        "opt_einsum backend is not available. "
        "For better performance, install and enable opt_einsum.",
        UserWarning
    )

opt_einsum backend enabled


definition of MLP

In [6]:
def foward_pass(X: Tensor, *params) -> Tensor:
    T: Tensor = X
    for i, P in enumerate(params):
        last_step: bool = i == (len(params) - 1)
        T = T @ P
        T = torch.softmax(T, dim=1) if last_step else torch.relu(T)
    return T.sum()

## **Benchmark jacobians on full MLP**

definition of helper functions to meassure jacobian times

In [7]:
def time_autograd_jacobian(param_grad: bool, reps: int, X: Tensor, *params) -> float:
    def _fixed_forward_pass(X) -> Tensor:
        return foward_pass(X, *params)
    def _foward_and_backward() -> None:
        if param_grad:
            torch.autograd.functional.jacobian(func=foward_pass, inputs=(X, *params))
        else:
            torch.autograd.functional.jacobian(func=_fixed_forward_pass, inputs=X)
        return None
    time: float = timeit(
        lambda: _foward_and_backward(),
        number=reps,
    )
    return time

def time_thoad_jacobian(param_grad: bool, reps: int, X: Tensor, *params) -> float:
    X.requires_grad_(True)
    params: list[Tensor] = [P.requires_grad_(param_grad) for P in params]
    def _foward_and_backward() -> None:
        T: Tensor = foward_pass(X, *params)
        ctrl: Controller = backward(tensor=T, order=1, crossings=param_grad, keep_batch=True)
        ctrl.clear()
        return None
    time: float = timeit(
        lambda: _foward_and_backward(),
        number=reps,
    )
    return time

**jacobian** computational cost w.r.t. **batch size**

In [None]:
for batch_size in [10, 20, 30, 40, 50, 60, 70, 80]:
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/batch_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(False, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(False, reps, X, *params)

    print(
        f"batch size: {batch_size:03d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

batch size: 010 -> autograd: 0.0007  thoad: 0.0140
batch size: 020 -> autograd: 0.0003  thoad: 0.0081
batch size: 030 -> autograd: 0.0007  thoad: 0.0095
batch size: 040 -> autograd: 0.0004  thoad: 0.0117
batch size: 050 -> autograd: 0.0007  thoad: 0.0104
batch size: 060 -> autograd: 0.0005  thoad: 0.0101
batch size: 070 -> autograd: 0.0005  thoad: 0.0109
batch size: 080 -> autograd: 0.0006  thoad: 0.0106


**jacobian** computational cost w.r.t. **param size**

In [None]:
for param_size in [10, 20, 30, 40, 50, 60, 70, 80]:
    batch_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/param_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(False, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(False, reps, X, *params)

    print(
        f"param size: {param_size:03d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

param size: 010 -> autograd: 0.0008  thoad: 0.0109
param size: 020 -> autograd: 0.0005  thoad: 0.0122
param size: 030 -> autograd: 0.0006  thoad: 0.0096
param size: 040 -> autograd: 0.0003  thoad: 0.0091
param size: 050 -> autograd: 0.0005  thoad: 0.0090
param size: 060 -> autograd: 0.0005  thoad: 0.0096
param size: 070 -> autograd: 0.0006  thoad: 0.0098
param size: 080 -> autograd: 0.0006  thoad: 0.0136


**jacobian** computational cost w.r.t. **graph depth**

In [None]:
for depth in range(2, 21):
    batch_size: int = 60
    param_size: int = int(20 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(depth)]

    reps: int = int(20 * (1/depth) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(False, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(False, reps, X, *params)

    print(
        f"graph depth: {depth:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

graph depth: 02 -> autograd: 0.0011  thoad: 0.0100
graph depth: 03 -> autograd: 0.0005  thoad: 0.0095
graph depth: 04 -> autograd: 0.0005  thoad: 0.0116
graph depth: 05 -> autograd: 0.0005  thoad: 0.0115
graph depth: 06 -> autograd: 0.0005  thoad: 0.0132
graph depth: 07 -> autograd: 0.0008  thoad: 0.0162
graph depth: 08 -> autograd: 0.0010  thoad: 0.0179
graph depth: 09 -> autograd: 0.0007  thoad: 0.0176
graph depth: 10 -> autograd: 0.0008  thoad: 0.0190
graph depth: 11 -> autograd: 0.0015  thoad: 0.0231
graph depth: 12 -> autograd: 0.0009  thoad: 0.0232
graph depth: 13 -> autograd: 0.0015  thoad: 0.0278
graph depth: 14 -> autograd: 0.0011  thoad: 0.0288
graph depth: 15 -> autograd: 0.0010  thoad: 0.0339
graph depth: 16 -> autograd: 0.0011  thoad: 0.0344
graph depth: 17 -> autograd: 0.0014  thoad: 0.0470
graph depth: 18 -> autograd: 0.0019  thoad: 0.0540
graph depth: 19 -> autograd: 0.0015  thoad: 0.0502
graph depth: 20 -> autograd: 0.0015  thoad: 0.0540


**jacobian** computational cost w.r.t. **batch size** (param gradients included)

In [None]:
for batch_size in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/batch_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(True, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(True, reps, X, *params)

    print(
        f"batch size: {batch_size:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

batch size: 05 -> autograd: 0.0006  thoad: 0.0152
batch size: 10 -> autograd: 0.0005  thoad: 0.0144
batch size: 15 -> autograd: 0.0005  thoad: 0.0147
batch size: 20 -> autograd: 0.0005  thoad: 0.0160
batch size: 25 -> autograd: 0.0007  thoad: 0.0174
batch size: 30 -> autograd: 0.0007  thoad: 0.0166
batch size: 35 -> autograd: 0.0005  thoad: 0.0158
batch size: 40 -> autograd: 0.0006  thoad: 0.0176
batch size: 45 -> autograd: 0.0008  thoad: 0.0160
batch size: 50 -> autograd: 0.0008  thoad: 0.0161


**jacobian** computational cost w.r.t. **param size** (param gradients included)

In [None]:
for param_size in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    batch_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/param_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(True, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(True, reps, X, *params)

    print(
        f"param size: {param_size:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

param size: 05 -> autograd: 0.0007  thoad: 0.0163
param size: 10 -> autograd: 0.0005  thoad: 0.0175
param size: 15 -> autograd: 0.0005  thoad: 0.0167
param size: 20 -> autograd: 0.0006  thoad: 0.0174
param size: 25 -> autograd: 0.0005  thoad: 0.0180
param size: 30 -> autograd: 0.0011  thoad: 0.0217
param size: 35 -> autograd: 0.0011  thoad: 0.0201
param size: 40 -> autograd: 0.0005  thoad: 0.0195
param size: 45 -> autograd: 0.0006  thoad: 0.0173
param size: 50 -> autograd: 0.0008  thoad: 0.0167


**jacobian** computational cost w.r.t. **graph depth** (param gradients included)

In [None]:
for depth in range(2, 7):
    batch_size: int = 10
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(depth)]

    reps: int = int(200 * (1/depth) * REPEAT_SCALE)
    autograd_time: float = time_autograd_jacobian(True, reps, X, *params)
    thoad_time: float = time_thoad_jacobian(True, reps, X, *params)

    print(
        f"graph depth: {depth:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

graph depth: 02 -> autograd: 0.0012  thoad: 0.0139
graph depth: 03 -> autograd: 0.0005  thoad: 0.0174
graph depth: 04 -> autograd: 0.0007  thoad: 0.0213
graph depth: 05 -> autograd: 0.0006  thoad: 0.0237
graph depth: 06 -> autograd: 0.0012  thoad: 0.0345


## **Benchmark hessians on full MLP**

definition of helper functions to meassure hessian times

In [14]:
def time_autograd_hessian(param_grad: bool, reps: int, X: Tensor, *params) -> float:
    def _fixed_forward_pass(X) -> Tensor:
        return foward_pass(X, *params)
    def _foward_and_backward() -> None:
        if param_grad:
            torch.autograd.functional.hessian(func=foward_pass, inputs=(X, *params))
        else:
            torch.autograd.functional.hessian(func=_fixed_forward_pass, inputs=X)
        return None
    time: float = timeit(
        lambda: _foward_and_backward(),
        number=reps,
    )
    return time

def time_thoad_hessian(param_grad: bool, reps: int, X: Tensor, *params) -> float:
    X.requires_grad_(True)
    params: list[Tensor] = [P.requires_grad_(param_grad) for P in params]
    def _foward_and_backward() -> None:
        T: Tensor = foward_pass(X, *params)
        ctrl: Controller = backward(tensor=T, order=2, crossings=param_grad, keep_batch=True)
        ctrl.clear()
        return None
    time: float = timeit(
        lambda: _foward_and_backward(),
        number=reps,
    )
    return time

**hessian** computational cost w.r.t. **batch size**

In [None]:
for batch_size in [10, 20, 30, 40, 50, 60, 70, 80]:
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/batch_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(False, reps, X, *params)
    thoad_time: float = time_thoad_hessian(False, reps, X, *params)

    print(
        f"batch size: {batch_size:03d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

batch size: 010 -> autograd: 0.0322  thoad: 0.0179
batch size: 020 -> autograd: 0.0774  thoad: 0.0170
batch size: 030 -> autograd: 0.1125  thoad: 0.0174
batch size: 040 -> autograd: 0.1673  thoad: 0.0196
batch size: 050 -> autograd: 0.1987  thoad: 0.0188
batch size: 060 -> autograd: 0.2464  thoad: 0.0178
batch size: 070 -> autograd: 0.3157  thoad: 0.0175
batch size: 080 -> autograd: 0.3139  thoad: 0.0221


**hessian** computational cost w.r.t. **param size**

In [None]:
for param_size in [10, 20, 30, 40, 50, 60, 70, 80]:
    batch_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/param_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(False, reps, X, *params)
    thoad_time: float = time_thoad_hessian(False, reps, X, *params)

    print(
        f"param size: {param_size:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

param size: 10 -> autograd: 0.0325  thoad: 0.0218
param size: 20 -> autograd: 0.0819  thoad: 0.0182
param size: 30 -> autograd: 0.1147  thoad: 0.0186
param size: 40 -> autograd: 0.1766  thoad: 0.0236
param size: 50 -> autograd: 0.2110  thoad: 0.0253
param size: 60 -> autograd: 0.2792  thoad: 0.0323
param size: 70 -> autograd: 0.2708  thoad: 0.0348
param size: 80 -> autograd: 0.3115  thoad: 0.0454


**hessian** computational cost w.r.t. **graph depth**

In [None]:
for depth in range(2, 11):
    input_size: int = 20
    param_size: int = int(20 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(depth)]

    reps: int = int(20 * (1/depth) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(False, reps, X, *params)
    thoad_time: float = time_thoad_hessian(False, reps, X, *params)

    print(
        f"graph depth: {depth:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

graph depth: 02 -> autograd: 0.0754  thoad: 0.0139
graph depth: 03 -> autograd: 0.0794  thoad: 0.0335
graph depth: 04 -> autograd: 0.0941  thoad: 0.0253
graph depth: 05 -> autograd: 0.1173  thoad: 0.0385
graph depth: 06 -> autograd: 0.1258  thoad: 0.0344
graph depth: 07 -> autograd: 0.1352  thoad: 0.0338
graph depth: 08 -> autograd: 0.1468  thoad: 0.0407
graph depth: 09 -> autograd: 0.1652  thoad: 0.0453
graph depth: 10 -> autograd: 0.1897  thoad: 0.0622


**hessian** computational cost w.r.t. **batch size** (param gradients included)

In [None]:
for batch_size in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(20 * (1/batch_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(True, reps, X, *params)
    thoad_time: float = time_thoad_hessian(True, reps, X, *params)

    print(
        f"batch size: {batch_size:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

batch size: 05 -> autograd: 0.1456  thoad: 0.0527
batch size: 10 -> autograd: 0.1735  thoad: 0.0516
batch size: 15 -> autograd: 0.1929  thoad: 0.0550
batch size: 20 -> autograd: 0.2536  thoad: 0.0519
batch size: 25 -> autograd: 0.2782  thoad: 0.0517
batch size: 30 -> autograd: 0.3074  thoad: 0.0572
batch size: 35 -> autograd: 0.3403  thoad: 0.0564
batch size: 40 -> autograd: 0.4598  thoad: 0.0594
batch size: 45 -> autograd: 0.4883  thoad: 0.0590
batch size: 50 -> autograd: 0.4837  thoad: 0.0551


**hessian** computational cost w.r.t. **param size** (param gradients included)

In [None]:
for param_size in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    batch_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(3)]

    reps: int = int(200 * (1/param_size) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(True, reps, X, *params)
    thoad_time: float = time_thoad_hessian(True, reps, X, *params)

    print(
        f"param size: {param_size:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

param size: 05 -> autograd: 0.0545  thoad: 0.0586
param size: 10 -> autograd: 0.1825  thoad: 0.0529
param size: 15 -> autograd: 0.4287  thoad: 0.0532
param size: 20 -> autograd: 0.6604  thoad: 0.0658
param size: 25 -> autograd: 0.9822  thoad: 0.0618
param size: 30 -> autograd: 1.2951  thoad: 0.1134
param size: 35 -> autograd: 2.0411  thoad: 0.1073
param size: 40 -> autograd: 3.6852  thoad: 0.1747
param size: 45 -> autograd: 4.2953  thoad: 0.2358
param size: 50 -> autograd: 4.9012  thoad: 0.3049


**hessian** computational cost w.r.t. **graph depth** (param gradients included)

In [None]:
for depth in [2, 3, 4, 5, 6, 7, 8]:
    batch_size: int = 10
    param_size: int = int(10 * TENSOR_SCALE)
    x_shape: Tuple[int, int] = (batch_size, param_size)
    p_shape: Tuple[int, int] = (param_size, param_size)

    X: Tensor = torch.rand(size=x_shape, device=dev)
    params: list[Tensor] = [torch.rand(size=p_shape, device=dev) for _ in range(depth)]

    reps: int = int(20 * (1/depth) * REPEAT_SCALE)
    autograd_time: float = time_autograd_hessian(True, reps, X, *params)
    thoad_time: float = time_thoad_hessian(True, reps, X, *params)

    print(
        f"graph depth: {depth:02d} -> "
        f"autograd: {autograd_time / reps:.4f}  thoad: {thoad_time / reps:.4f}"
    )

graph depth: 02 -> autograd: 0.1375  thoad: 0.0324
graph depth: 03 -> autograd: 0.1757  thoad: 0.0628
graph depth: 04 -> autograd: 0.2567  thoad: 0.0781
graph depth: 05 -> autograd: 0.3848  thoad: 0.1105
graph depth: 06 -> autograd: 0.5297  thoad: 0.1862
graph depth: 07 -> autograd: 0.5988  thoad: 0.2180
graph depth: 08 -> autograd: 0.7029  thoad: 0.2517
