# Introduction

In [1]:
from collections import OrderedDict
import functools

import math
import torch
from torch.distributions import constraints

%env FUNSOR_TYPECHECK=1
import funsor
from funsor.terms import Funsor, Variable, Number, Lambda, Slice
from funsor.tensor import Tensor
from funsor.domains import Array, Bint, Real, Reals
from funsor.factory import Bound, Fresh, Has, Value, make_funsor, to_funsor
import funsor.ops as ops
from funsor.cnf import Contraction
from funsor.testing import random_tensor
from funsor.interpretations import reflect, memoize
import funsor.torch.distributions as dist

funsor.set_backend("torch")
torch.set_default_dtype(torch.float32)

env: FUNSOR_TYPECHECK=1


# Examples

## Building blocks

### Feedforward neural networks

\begin{aligned}
  X^0 &\in \mathbb{R}^{\mathsf{\vphantom{fg}input}} \\
  X^1 &= \sigma(W^1 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}input}}}{\vphantom{fg}\odot}} X^0 + b^1) & W^1 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_1 \times \mathsf{\vphantom{fg}input}} & b^1 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_1} \\
  X^2 &= \sigma(W^2 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}_1}}{\vphantom{fg}\odot}} X^1 + b^2) & W^2 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_2 \times \mathsf{\vphantom{fg}hidden}_1} & b^2 &\in \mathbb{R}^{\mathsf{\vphantom{fg}hidden}_2} \\
  X^3 &= \sigma(W^3 \mathbin{\underset{\substack{\mathsf{\vphantom{fg}hidden}_2}}{\vphantom{fg}\odot}} X^2 + b^3) & W^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}out}\times \mathsf{\vphantom{fg}hidden}_2} & b^3 &\in \mathbb{R}^{\mathsf{\vphantom{fg}out}}
\end{aligned}

In [19]:
input_dim = 100
X0 = random_tensor(
    OrderedDict([("input_layer", Bint[input_dim])])
)

hidden_1_dim = 32
W1 = random_tensor(
    OrderedDict([
        ("input_layer", Bint[input_dim]),
        ("hidden_layer_1", Bint[hidden_1_dim])
    ])
)
b1 = random_tensor(
    OrderedDict([("hidden_layer_1", Bint[hidden_1_dim])])
)
X1 = ((W1 * X0).reduce(ops.add, "input_layer") + b1).sigmoid()

hidden_2_dim = 16
W2 = random_tensor(
    OrderedDict([
        ("hidden_layer_1", Bint[hidden_1_dim]),
        ("hidden_layer_2", Bint[hidden_2_dim])
    ])
)
b2 = random_tensor(
    OrderedDict([("hidden_layer_2", Bint[hidden_2_dim])])
)
X2 = ((W2 * X1).reduce(ops.add, "hidden_layer_1") + b2).sigmoid()

hidden_3_dim = 8
W3 = random_tensor(
    OrderedDict([
        ("hidden_layer_2", Bint[hidden_2_dim]),
        ("hidden_layer_3", Bint[hidden_3_dim])
    ])
)
b3 = random_tensor(
    OrderedDict([("hidden_layer_3", Bint[hidden_3_dim])])
)
X3 = ((W3 * X2).reduce(ops.add, "hidden_layer_2") + b3).sigmoid()

In [20]:
@make_funsor
def FullConnLayer(
    x: Has[{"layer"}],
    W: Has[{"layer"}],
    b: Funsor,
    layer: Bound
) -> Fresh[lambda x: x]:
    result = ((W * x).reduce(ops.add, layer) + b).sigmoid()
    return result

In [21]:
input_dim = 100
X0 = random_tensor(
    OrderedDict([("layer", Bint[input_dim])])
)
hidden_1_dim = 32
W1 = random_tensor(
    OrderedDict([
        ("layer", Bint[input_dim]),
        ("out_layer", Bint[hidden_1_dim])
    ])
)
b1 = random_tensor(
    OrderedDict([("out_layer", Bint[hidden_1_dim])])
)

X1 = FullConnLayer(X0, W1, b1, "layer")
X1

Tensor(tensor([6.7379e-03, 6.5177e-02, 9.8971e-01, 9.9932e-01, 1.0000e+00, 9.1053e-01,
        1.0000e+00, 2.1048e-03, 1.2270e-02, 5.3009e-01, 9.9986e-01, 1.8652e-02,
        2.8300e-01, 9.9843e-01, 9.9932e-01, 9.9806e-01, 9.4389e-01, 9.9555e-01,
        9.9038e-01, 2.9201e-05, 1.9454e-01, 7.2684e-01, 9.9984e-01, 1.0000e+00,
        1.4294e-01, 3.8029e-04, 3.7336e-03, 4.4616e-05, 1.0000e+00, 9.9998e-01,
        4.2059e-01, 2.0911e-10]), OrderedDict([('out_layer', Bint[32, ])]), 'real')

### Recurrent neural networks

In [22]:
# TODO
@make_funsor
def RecurrentLayer(
    x: Funsor,
    Wh: Funsor,
    Wi: Funsor,
    b: Funsor,
    hidden: Bound,
    input: Bound
) -> Fresh[lambda x: x]:
    output = ((Wh * h).reduce(ops.add, "hidden") + (Wi * x).reduce(ops.add, "input") + b).sigmoid()
    return output(hidden="new_hidden")

### Attention

In [22]:
@make_funsor
def Softmax(
    x: Funsor,
    ax: Bound,
    ax2: Fresh[lambda ax: ax]
) -> Fresh[lambda x: x]:
    x = x(**{ax.name: ax2.name})
    y = x - x.reduce(ops.logaddexp, ax2)
    return y.exp()

\begin{aligned}
  \text{Attention} \colon \mathbb{R}^{\mathsf{\vphantom{fg}key}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times\mathsf{\vphantom{fg}key}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}\times\mathsf{\vphantom{fg}val}} \times \mathbb{R}^{\mathsf{\vphantom{fg}seq}} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}val}} \\
\text{Attention}(Q, K, V, M) &= \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}}}{\vphantom{fg}\mathrm{softmax}}} \left( \frac{Q \mathbin{\underset{\substack{\mathsf{\vphantom{fg}key}}}{\vphantom{fg}\odot}} K}{\sqrt{|\mathsf{\vphantom{fg}key}|}} + M \right) \mathbin{\underset{\substack{\mathsf{\vphantom{fg}seq}}}{\vphantom{fg}\odot}} V.
\end{aligned}

In [26]:
@make_funsor
def Attention(
    Q: Has[{"key"}],
    K: Has[{"key", "seq"}],
    V: Has[{"seq2"}],
    M: Has[{"seq"}],
    key: Bound,
    seq: Bound,
    seq2: Bound
) -> Fresh[lambda Q: Q]:
    x = (Q * K).reduce(ops.add, key) / math.sqrt(key.output.size) + M
    return (Softmax(x, seq, seq2) * V).reduce(ops.add, seq2)

In [27]:
q = random_tensor(OrderedDict([("key", Bint[10])]))
k = random_tensor(OrderedDict([("key", Bint[10]), ("seq", Bint[3])]))
v = random_tensor(OrderedDict([("seq2", Bint[3]), ("val", Bint[5])]))
m = random_tensor(OrderedDict([("seq", Bint[3])]))
Attention(q, k, v, m, "key", "seq", "seq2")

Tensor(tensor([ 0.5775, -0.4559,  0.4790,  0.6897,  0.2296]), OrderedDict([('val', Bint[5, ])]), 'real')

### Convolution

\begin{aligned}
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} \colon \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n-|\mathsf{\vphantom{fg}kernel}|+1], \mathsf{\vphantom{fg}kernel}} \\
  \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} X &= Y,\ \text{where} \\
  Y_{\mathsf{\vphantom{fg}seq}(i), \mathsf{\vphantom{fg}kernel}(j)} &= X_{\mathsf{\vphantom{fg}seq}(i+j - 1)}.
\end{aligned}

In [28]:
@make_funsor
def Unroll(
    x: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size - k + 1]]
) -> Fresh[lambda x: x]:
    return x(**{seq.name: seq2 + kernel})

\begin{aligned}
\text{Conv1d} \colon \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}seq}[n]} &\rightarrow \mathbb{R}^{\mathsf{\vphantom{fg}seq}[n^\prime]} \\
\text{Conv1d}(X; W, b) &= W \mathbin{\underset{\substack{\mathsf{\vphantom{fg}chans}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\odot}} \mathop{\underset{\substack{\mathsf{\vphantom{fg}seq}\\ \mathsf{\vphantom{fg}kernel}}}{\vphantom{fg}\mathrm{unroll}}} X + b
\end{aligned}

\begin{aligned}
W &\in \mathbb{R}^{\mathsf{\vphantom{fg}chans}\times \mathsf{\vphantom{fg}kernel}} \\
b &\in \mathbb{R}\\
\end{aligned}

In [31]:
@make_funsor
def Conv1d(
    X: Has[{"chans", "seq"}],
    W: Has[{"chans", "kernel"}],
    b: Funsor,
    chans: Bound,
    k: Value[int],
    kernel: Bound,
    seq: Bound,
    seq2: Fresh[lambda seq, k: Bint[seq.size - k + 1]]
) -> Fresh[lambda X: X]:
    y = W * Unroll(X, seq, k, kernel, seq2)
    return y.reduce(ops.add, frozenset({chans, kernel})) + b

In [32]:
x = random_tensor(OrderedDict([("chans", Bint[3]), ("seq", Bint[10])]))
kernel = Variable("kernel", Bint[3])
w = random_tensor(OrderedDict([("chans", Bint[3]), ("kernel", Bint[3])]))
b = random_tensor(OrderedDict([]))

In [33]:
Conv1d(x, w, b, "chans", 3, "kernel", "seq", "seq2")

Tensor(tensor([ 2.3575,  0.3245, -3.2575,  0.5397,  1.4327, -1.0565, -1.4984,  1.0468]), OrderedDict([('seq2', Bint[8, ])]), 'real')

In [34]:
@make_funsor
def Conv2d(
    X: Has[{"chans", "height", "width"}],
    W: Has[{"chans", "kh", "kw"}],
    b: Funsor,
    chans: Bound,
    kh_size: Value[int],
    kh: Bound,
    height: Bound,
    height2: Fresh[lambda height, kh_size: Bint[height.size - kh_size + 1]],
    kw_size: Value[int],
    kw: Bound,
    width: Bound,
    width2: Fresh[lambda width, kw_size: Bint[width.size - kw_size + 1]]
) -> Fresh[lambda X: X]:
    y = W * Unroll(Unroll(X, width, kw_size, kw, width2), height, kh_size, kh, height2)
    return y.reduce(ops.add, frozenset({chans, kh, kw})) + b

In [35]:
x = random_tensor(OrderedDict([("chans", Bint[3]), ("height", Bint[10]), ("width", Bint[8])]))
w = random_tensor(OrderedDict([("chans", Bint[3]), ("kh", Bint[3]), ("kw", Bint[4])]))
b = random_tensor(OrderedDict([]))

Conv2d(x, w, b, "chans", 3, "kh", "height", "height2", 4, "kw", "width", "width2")

Tensor(tensor([[-6.5971,  8.6155,  4.7653,  4.5988,  3.9547],
        [ 3.2545,  5.7080,  2.5373,  0.5605, 10.9278],
        [ 5.1632,  8.2678,  7.8185,  3.3192, -2.3040],
        [-1.8145,  2.7405, -8.2361,  3.2329, 13.0263],
        [-3.6424, 16.5613,  4.2129,  5.0539,  0.5886],
        [ 1.2888,  4.2768, -1.1993, -1.6411,  8.0008],
        [ 0.6596, 12.1573, -3.1636,  2.7850,  2.2961],
        [ 5.0141,  5.8857,  8.3560,  0.7796, 14.1677]]), OrderedDict([('height2', Bint[8, ]), ('width2', Bint[5, ])]), 'real')

### Max pooling

In [36]:
@make_funsor
def Pool(
    x: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size // k]], # seq -> Bint[]
) -> Fresh[lambda x: x]: # x -> x.output (Bint[] or Real)
    assert not seq.output.size % k
    return x(**{seq.name: seq2 * Number(k, k+1) + kernel})

In [37]:
X = random_tensor(OrderedDict([("seq", Bint[10])]))
Y = Pool(X, "seq", 2, "kernel", "seq2")
Y

Tensor(tensor([[-0.5920, -1.4516],
        [-1.5946, -0.0205],
        [-1.4622, -0.9688],
        [ 0.2946,  0.4871],
        [-1.1216, -0.0153]]), OrderedDict([('seq2', Bint[5, ]), ('kernel', Bint[2, ])]), 'real')

In [38]:
@make_funsor
def MaxPool1d(
    X: Has[{"seq"}],
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size // k]]
) -> Fresh[lambda X: X]:
    return Pool(X, seq, k, kernel, seq2).reduce(ops.max, kernel)

In [39]:
X = random_tensor(OrderedDict([("seq", Bint[10])]))
Y = MaxPool1d(X, "seq", 2, "kernel", "seq2")
Y

Tensor(tensor([-0.1638,  0.8504,  0.7523, -1.0736,  0.0528]), OrderedDict([('seq2', Bint[5, ])]), 'real')

In [40]:
@make_funsor
def MaxPool2d(
    X: Has[{"height", "width"}],
    height: Bound,
    kh_size: Value[int],
    kh: Fresh[lambda kh_size: Bint[kh_size]],
    height2: Fresh[lambda height, kh_size: Bint[height.size // kh_size]],
    width: Bound,
    kw_size: Value[int],
    kw: Fresh[lambda kw_size: Bint[kw_size]],
    width2: Fresh[lambda width, kw_size: Bint[width.size // kw_size]],
) -> Fresh[lambda X: X]:
    y = Pool(Pool(X, height, kh_size, kh, height2), width, kw_size, kw, width2)
    return y.reduce(ops.max, frozenset({kh, kw}))

In [41]:
X = random_tensor(OrderedDict([("width", Bint[9]), ("height", Bint[4])]))
Y = MaxPool2d(X, "height", 2, "kh", "height2", "width", 3, "kw", "width2")
Y

Tensor(tensor([[0.6025, 1.3002],
        [0.4977, 1.1604],
        [0.7279, 0.8614]]), OrderedDict([('width2', Bint[3, ]), ('height2', Bint[2, ])]), 'real')

In [42]:
@make_funsor
def Pool2(
    x: Funsor,
    seq: Bound,
    kernel: Funsor,
    seq2: Fresh[lambda seq, kernel: Bint[seq.size // kernel.size]], # seq -> Bint[]
) -> Fresh[lambda x: x]: # x -> x.output (Bint[] or Real)
    return x(**{seq.name: seq2 * Number(kernel.output.size, kernel.output.size+1) + kernel})

### Normalization layers

In [53]:
# version 1
@make_funsor
def Mean(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    return ops.mean(funsor.terms.Lambda(ax, X), 0)

@make_funsor
def Variance(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    return ops.var(funsor.terms.Lambda(ax, X), 0)

In [58]:
# version 2
@make_funsor
def Mean(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    return X.reduce(ops.add, ax) / ax.output.size

@make_funsor
def Mean2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound
) -> Fresh[lambda X: X]:
    return X.reduce(ops.add, frozenset({ax, ax2})) / (ax.output.size * ax2.output.size)

@make_funsor
def Variance(
    X: Has[{"ax"}],
    ax: Bound
) -> Fresh[lambda X: X]:
    return Mean((X - Mean(X, ax))**2, ax)


@make_funsor
def Variance2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound
) -> Fresh[lambda X: X]:
    return Mean2((X - Mean2(X, ax, ax2))**2, ax, ax2)

In [59]:
@make_funsor
def Standardize(
    X: Has[{"ax"}],
    ax: Bound,
    new_ax: Fresh[lambda ax: ax]
) -> Fresh[lambda X: X]:
    y = X(**{ax.name: new_ax})
    return (y - Mean(X, ax)) / (Variance(X, ax) + ops.finfo(X.data).eps).sqrt()

@make_funsor
def Standardize2(
    X: Has[{"ax", "ax2"}],
    ax: Bound,
    ax2: Bound,
    new_ax: Fresh[lambda ax: ax],
    new_ax2: Fresh[lambda ax2: ax2]
) -> Fresh[lambda X: X]:
    y = X(**{ax.name: new_ax, ax2.name: new_ax2})
    return (y - Mean2(X, ax, ax2)) / (Variance2(X, ax, ax2) + ops.finfo(X.data).eps).sqrt()

In [60]:
@make_funsor
def BatchNorm(
    X: Has[{"batch", "layer"}],
    gamma: Funsor,
    beta: Funsor,
    batch: Bound,
    layer: Bound,
    batch2: Fresh[lambda batch: batch],
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    return Standardize2(X, batch, layer, batch2, layer2) * gamma + beta

@make_funsor
def InstanceNorm(
    X: Has[{"layer"}],
    gamma: Funsor,
    beta: Funsor,
    layer: Bound,
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    return Standardize(X, layer, layer2) * gamma + beta

# same as BatchNorm
@make_funsor
def LayerNorm(
    X: Has[{"chans", "layer"}],
    gamma: Funsor,
    beta: Funsor,
    chans: Bound,
    layer: Bound,
    chans2: Fresh[lambda chans: chans],
    layer2: Fresh[lambda layer: layer]
) -> Fresh[lambda X: X]:
    return Standardize2(X, chans, layer, chans2, layer2) * gamma + beta

In [61]:
x = random_tensor(OrderedDict([("batch", Bint[4]), ("chans", Bint[3]), ("layer", Bint[5])]))
g = random_tensor(OrderedDict([("chans", Bint[3])]))
b = random_tensor(OrderedDict([("chans", Bint[3])]))

BatchNorm(x, g, b, "batch", "layer", "batch2", "layer2")

Tensor(tensor([[[-0.5280,  1.0413, -2.6600, -0.4790, -0.6751],
         [-0.7684, -0.0121, -1.3248, -0.7829, -0.2051],
         [-3.5092, -0.9516, -0.2821, -0.7431, -1.6872]],

        [[-1.4081, -0.9501, -3.5667,  0.4102, -1.9272],
         [-0.4956, -0.5375, -0.5311, -0.0411, -0.7753],
         [ 1.2074, -3.1272, -4.1054,  0.2828,  1.2584]],

        [[ 1.7990, -3.8503, -1.8308, -0.9864, -0.0882],
         [-0.8277, -1.0676, -0.4600, -1.3033, -0.7147],
         [ 0.8170, -0.4777,  2.0906, -3.1950, -1.3288]],

        [[ 1.4840,  0.2684, -2.5263,  2.0744,  3.6022],
         [-0.3857, -0.6719, -0.6715, -0.7410, -0.5500],
         [-2.7286, -0.4270, -1.7050, -2.5529, -0.9331]]]), OrderedDict([('batch2', Bint[4, ]), ('chans', Bint[3, ]), ('layer2', Bint[5, ])]), 'real')

In [62]:
InstanceNorm(x, g, b, "layer", "layer2")

Tensor(tensor([[[-0.3284,  2.1825, -3.7395, -0.2500, -0.5638],
         [-0.7521, -0.2031, -1.1559, -0.7626, -0.3432],
         [-4.2041, -0.3833,  0.6169, -0.0718, -1.4822]],

        [[-0.4231,  0.2427, -3.5611,  2.2202, -1.1777],
         [-0.6709, -0.7302, -0.7212, -0.0278, -1.0668],
         [ 0.4637, -2.7675, -3.4967, -0.2255,  0.5017]],

        [[ 2.2742, -3.4231, -1.3864, -0.5348,  0.3710],
         [-0.5886, -0.8684, -0.1596, -1.1435, -0.4567],
         [ 0.0504, -1.1600,  1.2410, -3.7002, -1.9556]],

        [[-0.0780, -1.1931, -3.7567,  0.4636,  1.8650],
         [-0.0538, -0.8267, -0.8256, -1.0133, -0.4975],
         [-3.1105,  1.2472, -1.1724, -2.7777,  0.2889]]]), OrderedDict([('batch', Bint[4, ]), ('chans', Bint[3, ]), ('layer2', Bint[5, ])]), 'real')

## Transformer

## LeNet

In [79]:
@make_funsor
def Relu(
    X: Funsor
) -> Fresh[lambda X: X]:
    return ops.max(X, Number(0.0))

In [80]:
W1 = random_tensor(
    OrderedDict([
        ("chans", Bint[3]),
        ("kh", Bint[3]),
        ("kw", Bint[4]),
        ("chans2", Bint[3])
    ]),
)
b1 = random_tensor(OrderedDict([("chans2", Bint[3])]))
W3 = random_tensor(
    OrderedDict([
        ("hidden", Bint[3]),
        ("height3", Bint[4]),
        ("width3", Bint[4]),
        ("chans2", Bint[3])
    ]),
)
b3 = random_tensor(OrderedDict([("hidden", Bint[3])]))
W4 = random_tensor(
    OrderedDict([
        ("hidden", Bint[3]),
        ("classes", Bint[5]),
    ]),
)
b4 = random_tensor(OrderedDict([("classes", Bint[5])]))

X0 = random_tensor(
    OrderedDict([
        ("batch", Bint[4]),
        ("chans", Bint[3]),
        ("height", Bint[14]),
        ("width", Bint[15])
    ])
)

T1 = Relu(
    Conv2d(X0, W1, b1, "chans", 3, "kh", "height", "height2", 4, "kw", "width", "width2")
)
X1 = MaxPool2d(T1, "height2", 3, "kh", "height3", "width2", 3, "kw", "width3")
X3 = (W3 * X1).reduce(ops.add, frozenset({"height3", "width3", "chans2"})) + b3
O = Softmax(((W4 * X3).reduce(ops.add, "hidden") + b4), "classes", "classes2")

In [81]:
O

Tensor(tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 3.2374e-23, 2.6446e-35, 5.3192e-34],
        [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00]]), OrderedDict([('classes2', Bint[5, ]), ('batch', Bint[4, ])]), 'real')

## Other examples

### Discrete random variables

\begin{aligned}
p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) &\in [0, 1]^{\mathsf{\vphantom{fg}A} \times \mathsf{\vphantom{fg}B}} & \sum\limits_{\substack{\mathsf{\vphantom{fg}B}}} p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) &= 1 \\
p(\mathsf{\vphantom{fg}A}) &\in [0, 1]^{\mathsf{\vphantom{fg}A}} & \sum\limits_{\substack{\mathsf{\vphantom{fg}A}}} p(\mathsf{\vphantom{fg}A}) &= 1
\end{aligned}

In [18]:
pB_given_A = Tensor(
    torch.tensor([[0.2, 0.3, 0.5],
                  [0.8, 0.1, 0.1]]),
)["A", "B"]

A = Tensor(
    torch.tensor([0.6, 0.4]),
)["A"]

\begin{aligned}
p(\mathsf{\vphantom{fg}A}, \mathsf{\vphantom{fg}B}) &= p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) \odot p(\mathsf{\vphantom{fg}A}) && \text{chain rule}\\
p(\mathsf{\vphantom{fg}B}) &= \sum\limits_{\substack{\mathsf{\vphantom{fg}A}}} p(\mathsf{\vphantom{fg}A}, \mathsf{\vphantom{fg}B}) = p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) \mathbin{\underset{\substack{\mathsf{\vphantom{fg}A}}}{\vphantom{fg}\odot}} p(\mathsf{\vphantom{fg}A}) && \text{marginalization} \\
p(\mathsf{\vphantom{fg}A} \mid \mathsf{\vphantom{fg}B}) &= \frac{p(\mathsf{\vphantom{fg}A}, \mathsf{\vphantom{fg}B})}{p(\mathsf{\vphantom{fg}B})} = \frac{p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) \odot p(\mathsf{\vphantom{fg}A})}{p(\mathsf{\vphantom{fg}B} \mid \mathsf{\vphantom{fg}A}) \mathbin{\underset{\substack{\mathsf{\vphantom{fg}A}}}{\vphantom{fg}\odot}} p(\mathsf{\vphantom{fg}A})}. && \text{Bayes' rule}
\end{aligned}

In [19]:
# chain rule
pA_and_B = pB_given_A * A
pA_and_B

Tensor(tensor([[0.1200, 0.1800, 0.3000],
        [0.3200, 0.0400, 0.0400]]), OrderedDict([('A', Bint[2, ]), ('B', Bint[3, ])]), 'real')

In [20]:
# marginalization
B = pA_and_B.reduce(ops.add, "A")
B

Tensor(tensor([0.4400, 0.2200, 0.3400]), OrderedDict([('B', Bint[3, ])]), 'real')

In [21]:
# Bayes' rule
pA_given_B = pA_and_B / B
pA_given_B

Tensor(tensor([[0.2727, 0.8182, 0.8824],
        [0.7273, 0.1818, 0.1176]]), OrderedDict([('A', Bint[2, ]), ('B', Bint[3, ])]), 'real')

### Continuous bag of words

In [114]:
X = Tensor(torch.tensor([[0, 1, 1, 0],
                         [1, 0, 0, 0],
                         [0, 0, 0, 1]]),
           dtype=1)["vocab", "seq"]
X.reduce(ops.add, "vocab")

Tensor(tensor([1, 1, 1, 1]), OrderedDict([('seq', Bint[4, ])]), 1)

In [116]:
E = random_tensor(
    OrderedDict([
        ("vocab", Bint[3]),
        ("emb", Bint[4]),
    ]),
)
W = random_tensor(
    OrderedDict([
        ("classes", Bint[5]),
        ("emb", Bint[4]),
    ]),
)

@make_funsor
def CBOW(
    X: Has[{"vocab", "seq"}],
    E: Has[{"emb", "vocab"}],
    W: Has[{"emb", "classes"}],
    emb: Bound,
    vocab: Bound,
    seq: Bound,
    classes: Bound,
    classes2: Fresh[lambda classes: classes]
) -> Fresh[lambda X: X]:
    y = ((W * E).reduce(ops.add, emb) * X).reduce(ops.add, vocab).reduce(ops.add, seq)
    return Softmax(y, classes, classes2)

CBOW(X, E, W, "emb", "vocab", "seq", "classes", "classes2")

Tensor(tensor([7.2377e-01, 9.4771e-05, 2.4277e-01, 2.4577e-02, 8.7923e-03]), OrderedDict([('classes2', Bint[5, ])]), 'real')

### Sudoku ILP

### K-means clustering

In [64]:
X = random_tensor(OrderedDict([("batch", Bint[10]), ("d", Bint[4])]))
C = random_tensor(OrderedDict([("clusters", Bint[3]), ("d", Bint[4])]))

@make_funsor
def Norm(
    X: Has[{"d"}],
    d: Bound
) -> Fresh[lambda X: X]:
    return (X**2).reduce(ops.add, d).sqrt()

In [72]:
ops.argmax(funsor.terms.Lambda(Variable("d", Bint[4]), X), 0)

AssertionError: 

In [66]:
# TODO implement reduce for ops.argmin and ops.argmax
@make_funsor
def Q(
    C: Has[{"clusters", "d"}],
    X: Has[{"d"}],
    d: Bound,
    clusters: Bound
) -> Fresh[lambda X: X]:
    y = Norm(C - X, d)
    return ops.argmin(funsor.terms.Lambda(clusters, y), 0)

#Q = Norm(C - X, "d").reduce(ops.argmin, "clusters")

In [67]:
Q(C, X, "d", "clusters")

AssertionError: 

In [None]:
(Q * X / Q).reduce(ops.add, "batch") # ???

### Beam search

### Multivariate normal distribution

In [15]:
@make_funsor
def Determinant(
    F: Funsor,
    ax1: Bound,
    ax2: Bound
) -> Fresh[lambda F: F]:
    assert ax1.output.size == ax2.output.size
    m = ax1.output.size
    if m == 1:
        return F(ax1=0, ax2=0)
    else:
        result = Number(0.0) # FIX ME
        for i in range(m):
            I1 = Tensor(torch.tensor([k for k in range(1, m)]), dtype=m)[ax1.name]
            I2 = Tensor(torch.tensor([k for k in range(m) if k != i]), dtype=m)[ax2.name]
            result += F(ax1=0, ax2=i) * Determinant(F(ax1=I1, ax2=I2))
        return result

In [16]:
X = random_tensor(OrderedDict([("ax1", Bint[2]), ("ax2", Bint[2])]))
Determinant(X, "ax1", "ax2")

AssertionError: Must provide exactly one type per subexpression

# Distributions

In [30]:
alpha = Tensor(torch.tensor([2., 1.]))["i"]
beta = Tensor(torch.tensor([1., 3.]))["i"]
value = Tensor(torch.tensor([4., 5.]))["i"]

In [31]:
dist.Gamma(alpha, beta)

Gamma(concentration=Tensor(tensor([2., 1.]), OrderedDict([('i', Bint[2, ])])), rate=Tensor(tensor([1., 3.]), OrderedDict([('i', Bint[2, ])])), value=value)

In [33]:
# log_prob
dist.Gamma(alpha, beta, value)

Tensor(tensor([ -2.6137, -13.9014]), OrderedDict([('i', Bint[2, ])]), 'real')

In [37]:
dist.Gamma(alpha, beta).sample(frozenset("a"))

KeyError: 'a'

# Extra

In [22]:
# dynamic programming
@make_funsor
def Fibbonaci(
    k: Value[int]
) -> Fresh[Real]:
    if k == 0:
        return Number(0.0)
    if k == 1:
        return Number(1.0)
    return Fibbonaci(k - 1) + Fibbonaci(k - 2)

with memoize():
    y = Fibbonaci(20)
y

Number(6765.0)

In [23]:
Fibbonaci(20)

Number(6765.0)