# Introduction

In [1]:
from collections import OrderedDict
import functools

import math
import torch
from torch.distributions import constraints

%env FUNSOR_TYPECHECK=1
import funsor
from funsor.terms import Funsor, Variable, Number, Lambda, Slice
from funsor.tensor import Tensor
from funsor.domains import Array, Bint, Real, Reals
from funsor.factory import Bound, Fresh, Has, Value, make_funsor, to_funsor
import funsor.ops as ops
from funsor.cnf import Contraction
from funsor.testing import random_tensor
from funsor.interpretations import reflect, memoize
import funsor.torch.distributions as dist

# Seed torch's global RNG first so the random demo tensors built below are
# repeatable under Restart Kernel -> Run All (random_tensor is expected to draw
# from torch's RNG once the torch backend is selected).
torch.manual_seed(0)
# Run funsor on the PyTorch backend and make float32 the default tensor dtype.
funsor.set_backend("torch")
torch.set_default_dtype(torch.float32)

env: FUNSOR_TYPECHECK=1


# Examples

## Building blocks

In [2]:
class Layer:
    """Minimal base class for callable layers.

    Subclasses override :meth:`forward`. Calling an instance forwards every
    positional and keyword argument to :meth:`forward` and returns its result.
    """

    def __init__(self) -> None:
        pass

    def forward(self, x: "Funsor") -> "Funsor":
        """Compute the layer output for ``x``.

        The annotation is ``Funsor`` so that it agrees with the subclass
        ``FullConnLayer.forward``, which accepts and returns funsors.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        """Delegate to :meth:`forward` with the same arguments."""
        return self.forward(*args, **kwargs)

### Feedforward neural networks

\begin{aligned}
  X^0 &\in \mathbb{R}^{\mathsf{\vphantom{fg}input}} \\
  X^1 &= \sigma(W^1 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}input}}}{\vphantom{fg}\odot}} X^0 + b^1) & W^1 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_1 \times \mathsf{\vphantom{fg}input}} & b^1 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_1} \\
  X^2 &= \sigma(W^2 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}_1}}{\vphantom{fg}\odot}} X^1 + b^2) & W^2 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_2 \times \mathsf{\vphantom{fg}hidden}_1} & b^2 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_2} \\
  X^3 &= \sigma(W^3 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}_2}}{\vphantom{fg}\odot}} X^2 + b^3) & W^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}out}\times \mathsf{\vphantom{fg}hidden}_2} & b^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}out}}
\end{aligned}

$$
\begin{aligned}
x &\in \mathbb{R}^{\mathsf{\vphantom{fg}layer}[n_0]} \\
W^l &\in \mathbb{R}^{\mathsf{\vphantom{fg}layer^2}[n_l] \times \mathsf{\vphantom{fg}layer}[n_{l-1}]} \\
  b^l &\in \mathbb{R}^{\mathsf{\vphantom{fg}layer^2}[n_l]} \\
  \text{FullConn}^l(x) &= \sigma\left(W^l \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} x + b^l\right)_{\mathsf{\vphantom{fg}layer^2}\rightarrow\mathsf{\vphantom{fg}layer}}
\end{aligned}
$$

In [29]:
class FullConnLayer(Layer):
    """Fully connected layer with a sigmoid activation over named dimensions.

    Holds a weight ``W`` with inputs ``input`` and ``output`` and a bias ``b``
    with input ``output``; both are random tensors whose underlying data has
    ``requires_grad`` enabled.

    Args:
        input_size: size of the ``input`` dimension.
        output_size: size of the ``output`` dimension.
    """

    def __init__(self, input_size: int, output_size: int) -> None:
        super().__init__()

        # The weight must exist before its gradient flag can be set; it maps the
        # "input" dimension to the "output" dimension.
        self.W = random_tensor(
            OrderedDict([
                ("input", Bint[input_size]),
                ("output", Bint[output_size]),
            ])
        )
        self.W.data.requires_grad = True

        self.b = random_tensor(
            OrderedDict([("output", Bint[output_size])])
        )
        self.b.data.requires_grad = True

    def forward(self, x: Funsor) -> Funsor:
        """Return ``sigmoid(W . x + b)`` with ``output`` renamed to ``input``.

        The product ``W * x`` is summed over the ``input`` dimension, the bias
        is added, and the result's ``output`` dimension is renamed to
        ``input``.
        """
        out = ops.sigmoid((self.W * x).reduce(ops.add, "input") + self.b)
        return out(**{"output": "input"})

In [8]:
@make_funsor
def FullConn(
    x: Has[{"layer"}],
    W: Has[{"layer"}],
    b: Funsor,
    layer: Bound
) -> Fresh[lambda x: x]:
    """Fully connected layer: ``sigmoid(sum_layer(W * x) + b)``.

    ``layer`` is the bound dimension shared by ``x`` and ``W`` that the
    product is summed over.
    """
    pre_activation = (W * x).reduce(ops.add, layer) + b
    return ops.sigmoid(pre_activation)

In [9]:
# Dimension sizes for the fully connected demo.
input_size, output_size = 100, 32

# Random weight, bias and input, each keyed by its named dimensions.
W = random_tensor(OrderedDict(input=Bint[input_size], output=Bint[output_size]))
b = random_tensor(OrderedDict(output=Bint[output_size]))
X = random_tensor(OrderedDict(input=Bint[input_size]))

# Sum over "input"; the displayed result is indexed by "output".
FullConn(X, W, b, "input")

Tensor(tensor([1.4408e-02, 9.4229e-01, 9.9999e-01, 1.0000e+00, 1.2358e-02, 1.0000e+00,
               9.7387e-01, 8.2982e-02, 9.9977e-01, 1.0000e+00, 4.1625e-14, 2.8234e-11,
               1.8357e-01, 6.9307e-01, 5.8466e-05, 3.1844e-05, 6.4502e-03, 6.1327e-01,
               1.0000e+00, 5.5068e-10, 2.2673e-08, 1.0000e+00, 9.9690e-01, 9.9876e-01,
               7.6623e-05, 9.9874e-01, 9.9795e-01, 9.9376e-09, 9.9998e-01, 2.5546e-07,
               1.0000e+00, 9.9994e-01]), {'output': Bint[32]})

### Recurrent neural networks

$$
\begin{aligned}
x^{t} &\in \mathbb{R}^{\mathsf{\vphantom{fg}input}} & t &= 1, \ldots, n \\
W^{\text{h}} &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}\times \mathsf{\vphantom{fg}hidden}^\prime} & |\mathsf{\vphantom{fg}hidden}| &= |\mathsf{\vphantom{fg}hidden}^\prime| \\
W^{\text{i}} &\in \mathbb{R}^{\mathsf{\vphantom{fg}input}\times \mathsf{\vphantom{fg}hidden}^\prime} \\
b &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}^\prime} \\
h^{0} &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}} \\
h^{t} &= \sigma\left( W^{\text{h}} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}}}{\vphantom{fg}\odot}} h^{t-1} + W^{\text{i}} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}input}}}{\vphantom{fg}\odot}} x^{t} + b \right)_{\mathsf{\vphantom{fg}hidden}^\prime\rightarrow\mathsf{\vphantom{fg}hidden}} & t &= 1, \ldots, n
\end{aligned}
$$

In [11]:
@make_funsor
def RNN(
    x: Has[{"layer"}],
    Wh: Has[{"hidden"}],
    Wi: Has[{"layer"}],
    b: Funsor,
    h: Has[{"hidden"}],
    hidden: Bound,
    layer: Bound
) -> Fresh[lambda x: x]:
    """One recurrent step: ``sigmoid(sum_hidden(Wh * h) + sum_layer(Wi * x) + b)``.

    ``hidden`` is the bound dimension shared by ``Wh`` and ``h``; ``layer`` is
    the bound dimension shared by ``Wi`` and ``x``.
    """
    recurrent_term = (Wh * h).reduce(ops.add, hidden)
    input_term = (Wi * x).reduce(ops.add, layer)
    return ops.sigmoid(recurrent_term + input_term + b)

In [12]:
# Dimension sizes for the recurrent demo.
input_size, hidden_size = 100, 32

# Random parameters: "hidden2" is the output dimension of both weights.
Wh = random_tensor(OrderedDict(hidden=Bint[hidden_size], hidden2=Bint[hidden_size]))
Wi = random_tensor(OrderedDict(input=Bint[input_size], hidden2=Bint[hidden_size]))
b = random_tensor(OrderedDict(hidden2=Bint[hidden_size]))
# Random previous state and current input.
h = random_tensor(OrderedDict(hidden=Bint[hidden_size]))
x = random_tensor(OrderedDict(input=Bint[input_size]))

# Sum over "hidden" and "input"; the displayed result is indexed by "hidden2".
RNN(x, Wh, Wi, b, h, "hidden", "input")

Tensor(tensor([9.3445e-01, 4.5910e-09, 5.0218e-02, 9.9998e-01, 9.8076e-01, 2.9043e-01,
               1.4833e-03, 5.6110e-01, 3.0206e-05, 2.0272e-02, 9.5883e-08, 1.4432e-03,
               1.0000e+00, 3.7437e-01, 7.9057e-01, 1.0000e+00, 9.9968e-01, 6.2209e-01,
               4.0754e-03, 5.3359e-10, 9.9999e-01, 9.9553e-01, 9.3140e-06, 9.9989e-01,
               1.2673e-04, 2.7258e-04, 1.0000e+00, 4.9444e-01, 1.0000e+00, 9.0319e-01,
               1.0000e+00, 1.0000e+00]), {'hidden2': Bint[32]})

### Attention

In [13]:
@make_funsor
def Softmax(
    x: Funsor,
    ax: Bound,
    ax2: Fresh[lambda ax: ax]
) -> Fresh[lambda x: x]:
    """Softmax of ``x`` along ``ax``, with that dimension renamed to ``ax2``.

    Computed as ``exp(x - logsumexp(x))`` after renaming ``ax`` to ``ax2``.
    """
    renamed = x(**{ax.name: ax2.name})
    log_normalizer = renamed.reduce(ops.logaddexp, ax2)
    return (renamed - log_normalizer).exp()

\begin{aligned}
  \text{Attention} \colon \mathbb{R}^{\mathsf{\vphantom{fg}key}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times\mathsf{\vphantom{fg}key}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times\mathsf{\vphantom{fg}val}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}val}} \\
\text{Attention}(Q, K, V, M) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}}}{\vphantom{fg}\mathrm{softmax}}} \left( \frac{Q \mathbin{\underset{\substack{\mathsf{\vphantom{fg}key}}}{\vphantom{fg}\odot}} K}{\sqrt{|\mathsf{\vphantom{fg}key}|}} + M \right) \mathbin{\underset{\substack{\mathsf{\vphantom{fg}seq}}}{\vphantom{fg}\odot}} V.
\end{aligned}

In [14]:
@make_funsor
def Attention(
    Q: Has[{"key"}],
    K: Has[{"key", "seq"}],
    V: Has[{"seq2"}],
    M: Has[{"seq"}],
    key: Bound,
    seq: Bound,
    seq2: Bound
) -> Fresh[lambda Q: Q]:
    """Scaled dot-product attention with an additive mask ``M``.

    Scores are ``sum_key(Q * K) / sqrt(|key|) + M``; they are softmaxed along
    ``seq`` (renamed to ``seq2``) and used to weight ``V``, which is then
    summed over ``seq2``.
    """
    scale = math.sqrt(key.output.size)
    scores = (Q * K).reduce(ops.add, key) / scale + M
    weights = Softmax(scores, seq, seq2)
    return (weights * V).reduce(ops.add, seq2)

In [15]:
# Random query, keys, values and mask; the values use "seq2" as their sequence dimension.
q = random_tensor(OrderedDict(key=Bint[10]))
k = random_tensor(OrderedDict(key=Bint[10], seq=Bint[3]))
v = random_tensor(OrderedDict(seq2=Bint[3], val=Bint[5]))
m = random_tensor(OrderedDict(seq=Bint[3]))

# The displayed result is indexed by "val" only.
Attention(q, k, v, m, "key", "seq", "seq2")

Tensor(tensor([-1.1324, -0.3813,  1.1015, -0.6993,  0.0259]), {'val': Bint[5]})

### Convolution

\begin{aligned}
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} \colon \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n-|\mathsf{\vphantom{fg}kernel}|+1], \mathsf{\vphantom{fg}kernel}} \\
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} X &= Y,\ \text{where} \\
  Y_{\mathsf{\vphantom{fg}seq}(i), \mathsf{\vphantom{fg}kernel}(j)} &= X_{\mathsf{\vphantom{fg}seq}(i+j - 1)}.
\end{aligned}

In [16]:
@make_funsor
def Unroll(
    x: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size - k + 1]]
) -> Fresh[lambda x: x]:
    """Unroll ``x`` along ``seq`` into sliding windows of length ``k``.

    The ``seq`` dimension is replaced by the index ``seq2 + kernel``, so the
    result is indexed by a window position ``seq2`` (size ``|seq| - k + 1``)
    and an offset ``kernel`` (size ``k``).
    """
    window_index = seq2 + kernel
    return x(**{seq.name: window_index})

\begin{aligned}
\text{Conv1d} \colon \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n^\prime]} \\
\text{Conv1d}(X; W, b) &= W \mathbin{\underset{\substack{\mathsf{\vphantom{fg}chans}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\odot}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} X + b
\end{aligned}

\begin{aligned}
W &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}kernel}} \\
b &\in \mathbb{R}\\
\end{aligned}

In [17]:
@make_funsor
def Conv1d(
    X: Has[{"chans", "seq"}],
    W: Has[{"chans", "kernel"}],
    b: Funsor,
    chans: Bound,
    k: Value[int],
    kernel: Bound,
    seq: Bound,
    seq2: Fresh[lambda seq, k: Bint[seq.size - k + 1]]
) -> Fresh[lambda X: X]:
    """1-D convolution: unroll ``X`` along ``seq``, contract with ``W``, add ``b``.

    The product of ``W`` with the unrolled input is summed over ``chans`` and
    ``kernel``; the result is indexed by the fresh dimension ``seq2``.
    """
    windows = Unroll(X, seq, k, kernel, seq2)
    contracted = (W * windows).reduce(ops.add, frozenset({chans, kernel}))
    return contracted + b

In [18]:
# Random 3-channel sequence of length 10.
x = random_tensor(OrderedDict(chans=Bint[3], seq=Bint[10]))
# NOTE(review): this variable is not passed to the Conv1d call below, which
# names the kernel dimension by string instead.
kernel = Variable("kernel", Bint[3])
# Random filter of width 3 and a scalar bias (no named dimensions).
w = random_tensor(OrderedDict(chans=Bint[3], kernel=Bint[3]))
b = random_tensor(OrderedDict())

# Output dimension "seq2" has size 10 - 3 + 1 = 8.
Conv1d(x, w, b, "chans", 3, "kernel", "seq", "seq2")

Tensor(tensor([-0.4755, -2.8636, -0.4447, -1.5040,  2.2609, -2.9911, -0.2384,  1.2441]), {'seq2': Bint[8]})

$$
\begin{aligned}
  \text{Conv2d} \colon \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}height}[h] \times \mathsf{\vphantom{fg}width}[w]}
  &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}height}[h2] \times \mathsf{\vphantom{fg}width}[w2]} \\
  \text{Conv2d}(X; W, b) &= W \mathbin{\underset{\substack{\mathsf{\vphantom{fg}chans}\\ \mathsf{\vphantom{fg}kh}, \mathsf{\vphantom{fg}kw}}}{\vphantom{fg}\odot}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}height}\\ \mathsf{\vphantom{fg}kh}}}{\vphantom{fg}\mathrm{unroll}}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}width}\\\mathsf{\vphantom{fg}kw}}}{\vphantom{fg}\mathrm{unroll}}} X + b\end{aligned}
$$

$$
\begin{aligned}
W &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}kh}\times \mathsf{\vphantom{fg}kw}} \\
b &\in \mathbb{R}.
\end{aligned}
$$

In [20]:
@make_funsor
def Conv2d(
    X: Has[{"chans", "height", "width"}],
    W: Has[{"chans", "kh", "kw"}],
    b: Funsor,
    chans: Bound,
    kh_size: Value[int],
    kh: Bound,
    height: Bound,
    height2: Fresh[lambda height, kh_size: Bint[height.size - kh_size + 1]],
    kw_size: Value[int],
    kw: Bound,
    width: Bound,
    width2: Fresh[lambda width, kw_size: Bint[width.size - kw_size + 1]]
) -> Fresh[lambda X: X]:
    """2-D convolution built from two nested ``Unroll`` calls.

    ``X`` is unrolled along ``width`` first and then along ``height``; the
    product with ``W`` is summed over ``chans``, ``kh`` and ``kw`` and the bias
    ``b`` is added. The result is indexed by ``height2`` and ``width2``.
    """
    width_windows = Unroll(X, width, kw_size, kw, width2)
    patches = Unroll(width_windows, height, kh_size, kh, height2)
    contracted = (W * patches).reduce(ops.add, frozenset({chans, kh, kw}))
    return contracted + b

In [21]:
# Random 3-channel 10x8 image, a 3x4 filter and a scalar bias.
x = random_tensor(OrderedDict(chans=Bint[3], height=Bint[10], width=Bint[8]))
w = random_tensor(OrderedDict(chans=Bint[3], kh=Bint[3], kw=Bint[4]))
b = random_tensor(OrderedDict())

# Output sizes: height2 = 10 - 3 + 1 = 8, width2 = 8 - 4 + 1 = 5.
Conv2d(x, w, b, "chans", 3, "kh", "height", "height2", 4, "kw", "width", "width2")

Tensor(tensor([[ -3.6341,  11.4603,  -6.3920,   3.8056, -13.6978],
               [  0.5156,  11.6967,  -5.3698,   0.3156,  -2.5205],
               [ -9.1049,   6.2981,  -2.8979,   0.5617,   2.9153],
               [  5.9847,  -6.3165, -10.1714,  16.8733,   6.1089],
               [-14.8901,  -6.7428,  19.7337,  -8.8260,   2.6238],
               [ -4.4954,   3.4033,  -1.4193, -10.6725,  10.8208],
               [  6.6792,  11.7001,   1.0700,  10.5096,  -2.4380],
               [  2.5830,   2.3628,  -8.9506,   2.1479,  15.7352]]), {'height2': Bint[8], 'width2': Bint[5]})

### Max pooling

$$
\begin{aligned}
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq},\mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{pool}}} \colon \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n/|\mathsf{\vphantom{fg}kernel}|],\mathsf{\vphantom{fg}kernel}} \\
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq},\mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{pool}}} X &= Y,\ \text{where} \\
  Y_{\mathsf{\vphantom{fg}seq}(i), \mathsf{\vphantom{fg}kernel}(j)} &= X_{\mathsf{\vphantom{fg}seq}((i-1) \cdot |\mathsf{\vphantom{fg}kernel}| + j)}.
\end{aligned}
$$

In [22]:
@make_funsor
def Pool(
    x: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size // k]],
) -> Fresh[lambda x: x]:
    """Split ``seq`` into non-overlapping groups of ``k`` consecutive elements.

    The ``seq`` dimension is replaced by the index ``seq2 * k + kernel``, so
    the result is indexed by a group ``seq2`` and a within-group position
    ``kernel``. Asserts that ``k`` divides the size of ``seq``.
    """
    remainder = seq.output.size % k
    assert not remainder
    stride = Number(k, k + 1)
    return x(**{seq.name: seq2 * stride + kernel})

In [23]:
# Pool a random length-10 sequence into 5 groups of 2 and display the result.
X = random_tensor(OrderedDict(seq=Bint[10]))
Y = Pool(X, "seq", 2, "kernel", "seq2")
Y

Tensor(tensor([[ 0.4224, -1.2200],
               [-0.2514,  1.3845],
               [-0.5843,  0.0931],
               [-1.8701,  1.0598],
               [-1.6007,  0.2565]]), {'seq2': Bint[5], 'kernel': Bint[2]})

$$
\begin{aligned}
\text{MaxPool1d}_{k} \colon \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n/k]} \\
\text{MaxPool1d}_{k}(X) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{max}}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq},\mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{pool}}} X \\
|\mathsf{\vphantom{fg}kernel}| &= k \\
\text{MaxPool2d}_{kh,kw} \colon \mathbb{R}^{\mathsf{\vphantom{fg}height}[h] \times \mathsf{\vphantom{fg}width}[w]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}height}[h/kh] \times \mathsf{\vphantom{fg}width}[w/kw]} \\
\text{MaxPool2d}_{kh,kw}(X) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}kh},\mathsf{\vphantom{fg}kw}}}{\vphantom{fg}\mathrm{max}}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}height},\mathsf{\vphantom{fg}kh}}}{\vphantom{fg}\mathrm{pool}}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}width},\mathsf{\vphantom{fg}kw}}}{\vphantom{fg}\mathrm{pool}}} X \\
|\mathsf{\vphantom{fg}kh}| &= kh \\
|\mathsf{\vphantom{fg}kw}| &= kw.
\end{aligned}
$$

In [24]:
@make_funsor
def MaxPool1d(
    X: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size // k]]
) -> Fresh[lambda X: X]:
    """1-D max pooling: ``Pool`` along ``seq``, then max over ``kernel``."""
    pooled = Pool(X, seq, k, kernel, seq2)
    return pooled.reduce(ops.max, kernel)

In [25]:
# Max-pool a random length-10 sequence with window 2 and display the result.
X = random_tensor(OrderedDict(seq=Bint[10]))
Y = MaxPool1d(X, "seq", 2, "kernel", "seq2")
Y

Tensor(tensor([-0.1513,  1.1031,  0.9016,  0.1121, -0.0885]), {'seq2': Bint[5]})

In [26]:
@make_funsor
def MaxPool2d(
    X: Has[{"height", "width"}],
    height: Bound,
    kh_size: Value[int],
    kh: Fresh[lambda kh_size: Bint[kh_size]],
    height2: Fresh[lambda height, kh_size: Bint[height.size // kh_size]],
    width: Bound,
    kw_size: Value[int],
    kw: Fresh[lambda kw_size: Bint[kw_size]],
    width2: Fresh[lambda width, kw_size: Bint[width.size // kw_size]],
) -> Fresh[lambda X: X]:
    """2-D max pooling built from two nested ``Pool`` calls.

    ``X`` is pooled along ``height`` first and then along ``width``; the
    maximum is taken over both window dimensions ``kh`` and ``kw``.
    """
    pooled_rows = Pool(X, height, kh_size, kh, height2)
    pooled = Pool(pooled_rows, width, kw_size, kw, width2)
    return pooled.reduce(ops.max, frozenset({kh, kw}))

In [27]:
# Max-pool a random 9x4 (width x height) tensor with 3x2 windows and display the result.
X = random_tensor(OrderedDict(width=Bint[9], height=Bint[4]))
Y = MaxPool2d(X, "height", 2, "kh", "height2", "width", 3, "kw", "width2")
Y

Tensor(tensor([[0.7460, 1.0811],
               [1.7761, 0.7992],
               [0.6320, 0.8850]]), {'width2': Bint[3], 'height2': Bint[2]})

### Normalization layers

$$
\begin{aligned}
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}ax}}}{\vphantom{fg}\mathrm{standardize}}} \colon \mathbb{R}^{\mathsf{\vphantom{fg}ax}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}ax}} \\
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}ax}}}{\vphantom{fg}\mathrm{standardize}}}(X) &= \frac{X - \mathop{\underset{\substack{\mathsf{\vphantom{fg}ax}}}{\vphantom{fg}\mathrm{mean}}}(X)}{\sqrt{\mathop{\underset{\substack{\mathsf{\vphantom{fg}ax}}}{\vphantom{fg}\mathrm{var}}}(X) + \epsilon}}
\end{aligned}
$$

In [28]:
@make_funsor
def Mean(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    """Arithmetic mean of ``X`` over the bound dimension ``ax``."""
    total = X.reduce(ops.add, ax)
    return total / ax.output.size

@make_funsor
def Mean2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound
) -> Fresh[lambda X: X]:
    """Arithmetic mean of ``X`` jointly over the bound dimensions ``ax`` and ``ax2``."""
    total = X.reduce(ops.add, frozenset({ax, ax2}))
    count = ax.output.size * ax2.output.size
    return total / count

@make_funsor
def Variance(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    """Variance of ``X`` over ``ax``: the mean squared deviation from the mean."""
    centered = X - Mean(X, ax)
    return Mean(centered**2, ax)


@make_funsor
def Variance2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound
) -> Fresh[lambda X: X]:
    """Variance of ``X`` jointly over ``ax`` and ``ax2`` (mean squared deviation)."""
    centered = X - Mean2(X, ax, ax2)
    return Mean2(centered**2, ax, ax2)

In [29]:
@make_funsor
def Standardize(
    X: Has[{"ax"}],
    ax: Bound,
    new_ax: Fresh[lambda ax: ax]
) -> Fresh[lambda X: X]:
    """Standardize ``X`` along ``ax``, returning the result indexed by ``new_ax``.

    Computes ``(X - mean) / sqrt(var + eps)`` where the mean and variance are
    taken over ``ax`` and ``eps`` is the machine epsilon reported by
    ``ops.finfo`` for ``X.data``.
    """
    renamed = X(**{ax.name: new_ax})
    centered = renamed - Mean(X, ax)
    scale = (Variance(X, ax) + ops.finfo(X.data).eps).sqrt()
    return centered / scale

@make_funsor
def Standardize2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound,
    new_ax: Fresh[lambda ax: ax],
    new_ax2: Fresh[lambda ax2: ax2]
) -> Fresh[lambda X: X]:
    """Standardize ``X`` jointly along ``ax`` and ``ax2``.

    Computes ``(X - mean) / sqrt(var + eps)`` with the mean and variance taken
    over both dimensions; the result is indexed by ``new_ax`` and ``new_ax2``.
    ``eps`` is the machine epsilon reported by ``ops.finfo`` for ``X.data``.
    """
    renamed = X(**{ax.name: new_ax, ax2.name: new_ax2})
    centered = renamed - Mean2(X, ax, ax2)
    scale = (Variance2(X, ax, ax2) + ops.finfo(X.data).eps).sqrt()
    return centered / scale

$$
\begin{aligned}
\text{BatchNorm}(X; \gamma, \beta) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}batch},\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\mathrm{standardize}}}(X) \mathbin{\underset{\substack{}}{\vphantom{fg}\odot}} \gamma + \beta & \gamma, \beta &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}} \\
\text{InstanceNorm}(X; \gamma, \beta) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\mathrm{standardize}}}(X) \mathbin{\underset{\substack{}}{\vphantom{fg}\odot}} \gamma + \beta & \gamma, \beta &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}} \\
\text{LayerNorm}(X; \gamma, \beta) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}layer},\mathsf{\vphantom{fg}chans}}}{\vphantom{fg}\mathrm{standardize}}}(X) \mathbin{\underset{\substack{}}{\vphantom{fg}\odot}} \gamma + \beta & \gamma, \beta &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans},\mathsf{\vphantom{fg}layer}}
\end{aligned}
$$

In [30]:
@make_funsor
def BatchNorm(
    X: Has[{"batch", "layer"}],
    gamma: Funsor,
    beta: Funsor,
    batch: Bound,
    layer: Bound,
    batch2: Fresh[lambda batch: batch],
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    """Batch normalization: standardize over ``batch`` and ``layer``, then scale and shift.

    Returns ``Standardize2(X) * gamma + beta``, indexed by the fresh
    dimensions ``batch2`` and ``layer2``.
    """
    standardized = Standardize2(X, batch, layer, batch2, layer2)
    return standardized * gamma + beta

@make_funsor
def InstanceNorm(
    X: Has[{"layer"}],
    gamma: Funsor,
    beta: Funsor,
    layer: Bound,
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    """Instance normalization: standardize over ``layer`` only, then scale and shift.

    Returns ``Standardize(X) * gamma + beta``, indexed by the fresh dimension
    ``layer2``.
    """
    standardized = Standardize(X, layer, layer2)
    return standardized * gamma + beta

# Same computation as BatchNorm, but standardizing over (chans, layer).
@make_funsor
def LayerNorm(
    X: Has[{"chans", "layer"}],
    gamma: Funsor,
    beta: Funsor,
    chans: Bound,
    layer: Bound,
    chans2: Fresh[lambda chans: chans],
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    """Layer normalization: standardize over ``chans`` and ``layer``, then scale and shift.

    Returns ``Standardize2(X) * gamma + beta``, indexed by the fresh
    dimensions ``chans2`` and ``layer2``.
    """
    standardized = Standardize2(X, chans, layer, chans2, layer2)
    return standardized * gamma + beta

In [31]:
# Random batch of 4 examples with 3 channels and 5 layer units each.
x = random_tensor(OrderedDict(batch=Bint[4], chans=Bint[3], layer=Bint[5]))
# Per-channel scale and shift.
g = random_tensor(OrderedDict(chans=Bint[3]))
b = random_tensor(OrderedDict(chans=Bint[3]))

# Standardize over ("batch", "layer"); "chans" is left untouched.
BatchNorm(x, g, b, "batch", "layer", "batch2", "layer2")

Tensor(tensor([[[-0.4167, -2.0149,  1.5869, -0.9540, -2.4599],
                [-0.9972, -0.7414, -0.8085, -0.8597, -0.6717],
                [ 0.0374,  0.4729, -1.5694,  0.0670,  1.4730]],
       
               [[-0.9653,  3.6068,  0.9339,  0.7039,  0.1397],
                [-0.9462, -0.9947, -0.9195, -0.8253, -0.7915],
                [-0.3312, -0.6858, -0.7024,  0.6572, -1.3472]],
       
               [[ 0.2299,  2.3195, -2.4054, -1.9045,  1.1335],
                [-1.1645, -0.5576, -0.8411, -0.6485, -1.0886],
                [ 1.8121, -0.0398, -0.4208, -1.7959, -2.5694]],
       
               [[-2.3248,  0.4525,  0.6899, -1.6993,  3.3501],
                [-0.9168, -0.9505, -0.8369, -1.0205, -0.7951],
                [-0.6245, -0.0239,  1.2254, -0.1788,  0.0094]]]), {'batch2': Bint[4], 'chans': Bint[3], 'layer2': Bint[5]})

$$
\begin{aligned}
\text{GroupNorm}_k(X; \gamma, \beta) &= \left[ \mathop{\underset{\substack{\mathsf{\vphantom{fg}kernel},\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\mathrm{standardize}}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}chans}, \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{pool}}} X \right]_{(\mathsf{\vphantom{fg}chans},\mathsf{\vphantom{fg}kernel})\rightarrow \mathsf{\vphantom{fg}chans}} \mathbin{\underset{\substack{}}{\vphantom{fg}\odot}} \gamma + \beta \\
\end{aligned}
$$

$$
\begin{aligned}
|\mathsf{\vphantom{fg}kernel}| &= k\\
\gamma, \beta &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}}.
\end{aligned}
$$

## Transformer

$$
\begin{aligned}
  I &\in \{0, 1\}^{\mathsf{\vphantom{fg}seq}\times \mathsf{\vphantom{fg}vocab}} & \sum\limits_{\substack{\mathsf{\vphantom{fg}vocab}}} I &= 1 \\
  W &= (E \mathbin{\underset{\substack{\mathsf{\vphantom{fg}vocab}}}{\vphantom{fg}\odot}} I)\sqrt{|\mathsf{\vphantom{fg}layer}|} & E &\in \mathbb{R}^{\mathsf{\vphantom{fg}vocab}\times \mathsf{\vphantom{fg}layer}} \\
  P &\in \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times \mathsf{\vphantom{fg}layer}} \\
  P_{\mathsf{\vphantom{fg}seq}(p), \mathsf{\vphantom{fg}layer}(i)} &= \begin{cases}
    \sin((p-1) / 10000^{(i-1) / |\mathsf{\vphantom{fg}layer}|}) & \text{$i$ odd} \\ 
    \cos((p-1) / 10000^{(i-2) / |\mathsf{\vphantom{fg}layer}|}) & \text{$i$ even.}
  \end{cases}
\end{aligned}
$$

$$
\begin{aligned}
X^0 &= W+P \\
T^1 &= \text{LayerNorm}^1(\text{SelfAtt}^1(X^0)) + X^0\\
X^1 &= \text{LayerNorm}^{1^\prime}(\text{FFN}^1(T^1)) + T^1\\
&\vdotswithin{=} \\
T^{L} &= \text{LayerNorm}^L(\text{SelfAtt}^L(X^{L-1})) + X^{L-1}\\
X^{L} &= \text{LayerNorm}^{L^\prime}(\text{FFN}^L(T^L)) + T^L\\
O &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}vocab}}}{\vphantom{fg}\mathrm{softmax}}}(E \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X^L)
\end{aligned}
$$

$$
\begin{aligned}
  \text{LayerNorm}^l \colon \mathbb{R}^{\mathsf{\vphantom{fg}layer}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}layer}} \\
  \text{LayerNorm}^l(X) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\mathrm{XNorm}}}(X; \gamma^l, \beta^l).
\end{aligned}
$$

$$
\begin{aligned}
  \text{SelfAtt}^l \colon \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times \mathsf{\vphantom{fg}layer}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times \mathsf{\vphantom{fg}layer}} \\
  \text{SelfAtt}^l(X) &= Y
\end{aligned}
$$

$$
\begin{aligned}
  |\mathsf{\vphantom{fg}seq}| &= |\mathsf{\vphantom{fg}seq2}| \\
  |\mathsf{\vphantom{fg}key}| = |\mathsf{\vphantom{fg}val}| &= |\mathsf{\vphantom{fg}layer}|/|\mathsf{\vphantom{fg}heads}| \\
  Q &= W^{l,Q} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X_{\mathsf{\vphantom{fg}seq}\rightarrow\mathsf{\vphantom{fg}seq2}} & W^{l,Q} &\in \mathbb{R}^{\mathsf{\vphantom{fg}heads}\times \mathsf{\vphantom{fg}layer}\times \mathsf{\vphantom{fg}key}} \\
  K &= W^{l,K} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X & W^{l,K} &\in \mathbb{R}^{\mathsf{\vphantom{fg}heads}\times \mathsf{\vphantom{fg}layer}\times \mathsf{\vphantom{fg}key}} \\
  V &= W^{l,V} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X & W^{l,V} &\in \mathbb{R}^{\mathsf{\vphantom{fg}heads}\times \mathsf{\vphantom{fg}layer}\times \mathsf{\vphantom{fg}val}} \\
  M & \in \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times \mathsf{\vphantom{fg}seq2}} \\
  M_{\mathsf{\vphantom{fg}seq}(i), \mathsf{\vphantom{fg}seq2}(j)} &= \begin{cases}
    0 & i \leq j\\
    -\infty & \text{otherwise}
  \end{cases} \\
  Y &= W^{l,O} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}heads}\\ \mathsf{\vphantom{fg}val}}}{\vphantom{fg}\odot}} \text{Attention}(Q, K, V, M)_{\mathsf{\vphantom{fg}seq2}\rightarrow\mathsf{\vphantom{fg}seq}} & W^{l,O} &\in \mathbb{R}^{\mathsf{\vphantom{fg}heads}\times \mathsf{\vphantom{fg}val}\times \mathsf{\vphantom{fg}layer}}
\end{aligned}
$$

$$
\begin{aligned}
  \text{FFN}^l \colon \mathbb{R}^{\mathsf{\vphantom{fg}layer}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}layer}} \\
  \text{FFN}^l(X) &= X^2
\end{aligned}
$$

$$
\begin{aligned}
  X^1 &= \text{relu}(W^{l,1} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X + b^{l,1}) & W^{l,1} &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}\times \mathsf{\vphantom{fg}layer}} & b^{l,1} &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}} \\
  X^2 &= \text{relu}(W^{l,2} \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}}}{\vphantom{fg}\odot}} X^1 + b^{l,2}) & W^{l,2} &\in \mathbb{R}^{\mathsf{\vphantom{fg}layer}\times \mathsf{\vphantom{fg}hidden}} & b^{l,2} &\in \mathbb{R}^{\mathsf{\vphantom{fg}layer}}.
\end{aligned}
$$

## LeNet

$$
\begin{aligned}
X^0 &\in \mathbb{R}^{\mathsf{\vphantom{fg}batch}\times \mathsf{\vphantom{fg}chans}[c_0] \times \mathsf{\vphantom{fg}height}\times \mathsf{\vphantom{fg}width}} \\
T^1 &= \text{relu}(\text{Conv}^1(X^0)) \\
X^1 &= \text{MaxPool}^1(T^1) \\
T^2 &= \text{relu}(\text{Conv}^2(X^1)) \\
X^2 &= \text{MaxPool}^2(T^2)_{(\mathsf{\vphantom{fg}height},\mathsf{\vphantom{fg}width},\mathsf{\vphantom{fg}chans})\rightarrow\mathsf{\vphantom{fg}layer}} \\
X^3 &= \text{relu}(W^3 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}layer}}}{\vphantom{fg}\odot}} X^2 + b^3) & W^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}\times \mathsf{\vphantom{fg}layer}} & b^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}} \\
O &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}classes}}}{\vphantom{fg}\mathrm{softmax}}} (W^4 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}}}{\vphantom{fg}\odot}} X^3 + b^4) & W^4 &\in \mathbb{R}^{\mathsf{\vphantom{fg}classes}\times \mathsf{\vphantom{fg}hidden}} & b^4 &\in \mathbb{R}^{\mathsf{\vphantom{fg}classes}}\end{aligned}
$$

$$
\begin{aligned}
X^2 &= \text{MaxPool}^2(T^2) \\
X^3 &= \text{relu}(W^3 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}height}\\ \mathsf{\vphantom{fg}width}\\ \mathsf{\vphantom{fg}chans}}}{\vphantom{fg}\odot}} X^2 + b^3) & W^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}\times \mathsf{\vphantom{fg}height}\times \mathsf{\vphantom{fg}width}\times \mathsf{\vphantom{fg}chans}}.
\end{aligned}
$$

$$
\begin{aligned}
\text{Conv}^l(X) &= \text{Conv2d}(X; W^l, b^l)_{\mathsf{\vphantom{fg}chans2}\rightarrow\mathsf{\vphantom{fg}chans}}
\end{aligned}
$$

$$
\begin{aligned}
W^l & \in \mathbb{R}^{\mathsf{\vphantom{fg}chans2}[c_l] \times \mathsf{\vphantom{fg}chans}[c_{l-1}] \times \mathsf{\vphantom{fg}kh}[kh_l] \times \mathsf{\vphantom{fg}kw}[kw_l]} \\
b^l &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans2}[c_l]}
\end{aligned}
$$

$$
\begin{aligned}
\text{MaxPool}^l(X) &= \text{MaxPool2d}_{ph^l,ph^l}(X).
\end{aligned}
$$

In [79]:
@make_funsor
def Relu(
    X: Funsor
) -> Fresh[lambda X: X]:
    """Rectified linear unit: the elementwise maximum of ``X`` and zero."""
    zero = Number(0.0)
    return ops.max(X, zero)

In [80]:
# Random parameters for a reduced LeNet-style network with a single
# conv/pool stage followed by two dense layers.

# Convolution filter and bias: maps "chans" to "chans2" with a 3x4 kernel.
W1 = random_tensor(
    OrderedDict([
        ("chans", Bint[3]),
        ("kh", Bint[3]),
        ("kw", Bint[4]),
        ("chans2", Bint[3])
    ]),
)
b1 = random_tensor(OrderedDict([("chans2", Bint[3])]))
# Hidden dense layer: contracts the pooled feature map directly over its
# named dimensions instead of flattening it first.
W3 = random_tensor(
    OrderedDict([
        ("hidden", Bint[3]),
        ("height3", Bint[4]),
        ("width3", Bint[4]),
        ("chans2", Bint[3])
    ]),
)
b3 = random_tensor(OrderedDict([("hidden", Bint[3])]))
# Output layer: maps "hidden" to "classes".
W4 = random_tensor(
    OrderedDict([
        ("hidden", Bint[3]),
        ("classes", Bint[5]),
    ]),
)
b4 = random_tensor(OrderedDict([("classes", Bint[5])]))

# Random input batch of 4 three-channel 14x15 images.
X0 = random_tensor(
    OrderedDict([
        ("batch", Bint[4]),
        ("chans", Bint[3]),
        ("height", Bint[14]),
        ("width", Bint[15])
    ])
)

# Conv + relu: height2 = 14 - 3 + 1 = 12, width2 = 15 - 4 + 1 = 12.
T1 = Relu(
    Conv2d(X0, W1, b1, "chans", 3, "kh", "height", "height2", 4, "kw", "width", "width2")
)
# 3x3 max pooling: height3 = width3 = 12 // 3 = 4, matching W3.
X1 = MaxPool2d(T1, "height2", 3, "kh", "height3", "width2", 3, "kw", "width3")
# Hidden layer; the relu follows the LeNet equations X^3 = relu(W^3 . X^2 + b^3).
X3 = Relu(
    (W3 * X1).reduce(ops.add, frozenset({"height3", "width3", "chans2"})) + b3
)
# Class probabilities: softmax along "classes", renamed to "classes2".
O = Softmax(((W4 * X3).reduce(ops.add, "hidden") + b4), "classes", "classes2")