# Introduction

In [1]:
from collections import OrderedDict
import functools

import math
import torch
from torch.distributions import constraints

import funsor
from funsor.terms import Funsor, Variable, Number
from funsor.tensor import Tensor
from funsor.domains import Array, Bint, Real, Reals
from funsor.factory import Bound, Fresh, Has, Value, make_funsor, to_funsor
import funsor.ops as ops
from funsor.cnf import Contraction
from funsor.testing import random_tensor
from funsor.interpretations import reflect

funsor.set_backend("torch")
torch.set_default_dtype(torch.float32)

%load_ext mypy_ipython

# Informal Overview

## Named Tensors

In [2]:
A = Tensor(
    torch.tensor([[3., 1., 4.],
                  [1., 5., 9.],
                  [2., 6., 5.]]),
    inputs=OrderedDict([("height", Bint[3]), ("width", Bint[3])]),
    dtype="real"
)

In [3]:
A(height=0)

Tensor(tensor([3., 1., 4.]), OrderedDict([('width', Bint[3, ])]), 'real')

In [4]:
A(width=2)

Tensor(tensor([4., 9., 5.]), OrderedDict([('height', Bint[3, ])]), 'real')

## Named tensor operations

### Elementwise operations and broadcasting

In [5]:
A.sigmoid()  # 1 / (1 + (-A).exp())

Tensor(tensor([[0.9526, 0.7311, 0.9820],
        [0.7311, 0.9933, 0.9999],
        [0.8808, 0.9975, 0.9933]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [6]:
x = Tensor(
    torch.tensor([2., 7., 1.]),
    inputs=OrderedDict([("height", Bint[3])]),
    dtype="real"
)

y = Tensor(
    torch.tensor([1., 4., 1.]),
    inputs=OrderedDict([("width", Bint[3])]),
    dtype="real"
)

In [7]:
A + x

Tensor(tensor([[ 5.,  3.,  6.],
        [ 8., 12., 16.],
        [ 3.,  7.,  6.]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [8]:
A + y

Tensor(tensor([[ 4.,  5.,  5.],
        [ 2.,  9., 10.],
        [ 3., 10.,  6.]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

### Reductions

In [9]:
A.reduce(ops.add, reduced_vars="height")

Tensor(tensor([ 6., 12., 18.]), OrderedDict([('width', Bint[3, ])]), 'real')

In [10]:
A.reduce(ops.add, reduced_vars="width")

Tensor(tensor([ 8., 15., 13.]), OrderedDict([('height', Bint[3, ])]), 'real')

In [11]:
A.reduce(ops.add, reduced_vars={"height", "width"})

Tensor(36.0, OrderedDict(), 'real')

In [12]:
(A * y).reduce(ops.add, reduced_vars="width")

Tensor(tensor([11., 30., 31.]), OrderedDict([('height', Bint[3, ])]), 'real')

In [13]:
Contraction(
    red_op=ops.add,
    bin_op=ops.mul,
    reduced_vars=frozenset({Variable("width", output=Bint[3])}),
    terms=(A, y)
)

Tensor(tensor([11., 30., 31.]), OrderedDict([('height', Bint[3, ])]), 'real')

In [14]:
(x * x).reduce(ops.add, reduced_vars="height")  # innder product
x * y  # outer product
(A * y).reduce(ops.add, reduced_vars="width")  # matrix-vector product
# vector-matrix product is the same as matrix-vector product
(A * x).reduce(ops.add, reduced_vars="height")  # vector-matrix product

Tensor(tensor([15., 43., 76.]), OrderedDict([('width', Bint[3, ])]), 'real')

In [15]:
B = Tensor(
    torch.tensor([[3, 2, 5], [5, 4, 0], [8, 3, 6]]),
    inputs=OrderedDict([("width", Bint[3]), ("width'", Bint[3])]),
    dtype="real"
)

In [16]:
(A * B).reduce(ops.add, reduced_vars="width")  # matrix-matrix product

Tensor(tensor([[ 46.,  22.,  39.],
        [100.,  49.,  59.],
        [ 76.,  43.,  40.]]), OrderedDict([('height', Bint[3, ]), ("width'", Bint[3, ])]), 'real')

### Renaming and reshaping

In [17]:
A(**{"height": "height'"})  # A(height="height'")

Tensor(tensor([[3., 1., 4.],
        [1., 5., 9.],
        [2., 6., 5.]]), OrderedDict([("height'", Bint[3, ]), ('width', Bint[3, ])]), 'real')

# Examples

## Building blocks

### Feedforward neural networks

In [18]:
input_dim = 100
X0 = random_tensor(
    OrderedDict([("input_layer", Bint[input_dim])])
)

hidden_1_dim = 32
W1 = random_tensor(
    OrderedDict([
        ("input_layer", Bint[input_dim]),
        ("hidden_layer_1", Bint[hidden_1_dim])
    ])
)
b1 = random_tensor(
    OrderedDict([("hidden_layer_1", Bint[hidden_1_dim])])
)
X1 = ((W1 * X0).reduce(ops.add, "input_layer") + b1).sigmoid()

hidden_2_dim = 16
W2 = random_tensor(
    OrderedDict([
        ("hidden_layer_1", Bint[hidden_1_dim]),
        ("hidden_layer_2", Bint[hidden_2_dim])
    ])
)
b2 = random_tensor(
    OrderedDict([("hidden_layer_2", Bint[hidden_2_dim])])
)
X2 = ((W2 * X1).reduce(ops.add, "hidden_layer_1") + b2).sigmoid()

hidden_3_dim = 8
W3 = random_tensor(
    OrderedDict([
        ("hidden_layer_2", Bint[hidden_2_dim]),
        ("hidden_layer_3", Bint[hidden_3_dim])
    ])
)
b3 = random_tensor(
    OrderedDict([("hidden_layer_3", Bint[hidden_3_dim])])
)
X3 = ((W3 * X2).reduce(ops.add, "hidden_layer_2") + b3).sigmoid()

In [19]:
# is this correct?
@make_funsor
def FullConnLayer(
    x: Has[{"layer"}],
    W: Has[{"layer"}],
    b: Funsor,
    layer: Bound
) -> Fresh[lambda x: x]:
    result = ((W * x).reduce(ops.add, layer) + b).sigmoid()
    return result

In [20]:
input_dim = 100
X0 = random_tensor(
    OrderedDict([("layer", Bint[input_dim])])
)
hidden_1_dim = 32
W1 = random_tensor(
    OrderedDict([
        ("layer", Bint[input_dim]),
        ("out_layer", Bint[hidden_1_dim])
    ])
)
b1 = random_tensor(
    OrderedDict([("out_layer", Bint[hidden_1_dim])])
)

with reflect:
    FullConnLayer(X0, W1, b1, "layer")

X1 = FullConnLayer(X0, W1, b1, "layer")
X1

Tensor(tensor([1.4654e-03, 4.1117e-01, 1.0000e+00, 1.0000e+00, 7.1454e-01, 1.0000e+00,
        4.2027e-12, 4.1232e-06, 9.9991e-01, 1.9417e-04, 9.9754e-01, 6.8267e-12,
        2.8535e-07, 9.8524e-01, 9.9350e-01, 9.9993e-01, 1.0000e+00, 9.9926e-01,
        1.0000e+00, 9.5022e-02, 9.9571e-01, 3.1796e-02, 9.9969e-01, 4.1069e-04,
        9.9307e-01, 8.3539e-01, 3.3728e-08, 7.8285e-01, 7.5796e-01, 1.5954e-02,
        9.9961e-01, 9.9983e-01]), OrderedDict([('out_layer', Bint[32, ])]), 'real')

### Recurrent neural networks

In [None]:
@make_funsor
def RecurrentLayer(
    x: Funsor,
    Wh: Funsor,
    Wi: Funsor,
    b: Funsor,
    hidden: Bound,
    input: Bound
) -> Fresh[lambda x: x]:
    output = ((Wh * h).reduce(ops.add, "hidden") + (Wi * x).reduce(ops.add, "input") + b).sigmoid()
    return output(hidden="new_hidden")

### Attention

In [26]:
# Can Has be used here?
@make_funsor
def Softmax(
    x: Funsor,
    i: Funsor
) -> Fresh[lambda x: x]:
    return x.exp() / x.exp().reduce(ops.add, i)

In [27]:
@make_funsor
def Attention(
    Q: Has[{"key"}],
    K: Has[{"key", "seq"}],
    V: Has[{"seq"}],
    M: Has[{"seq"}], # does only one of Q, K, and M need to have "seq"?
    key: Bound,
    seq: Bound
) -> Fresh[lambda Q: Q]:
    x = (Q * K).reduce(ops.add, key) / math.sqrt(key.output.size) + M
    return (Softmax(x, seq) * V).reduce(ops.add, seq)

In [28]:
q = random_tensor(OrderedDict([("key", Bint[10])]))
k = random_tensor(OrderedDict([("key", Bint[10]), ("seq", Bint[3])]))
v = random_tensor(OrderedDict([("seq", Bint[3]), ("val", Bint[5])]))
m = random_tensor(OrderedDict([("seq", Bint[3])]))
Attention(q, k, v, m, "key", "seq")

Tensor(tensor([-1.1402,  0.1616, -0.3963, -1.2688,  0.0687]), OrderedDict([('val', Bint[5, ])]), 'real')

### Convolution

In [21]:
@make_funsor
def Unroll(
    x: Funsor,
    seq: Bound,
    kernel: Funsor
) -> Fresh[lambda x: x]:
    temp_seq = Variable("temp_seq", Bint[seq.output.size - kernel.output.size + 1])
    return x(**{seq.name: temp_seq+kernel})(**{"temp_seq": seq.name})

In [22]:
X = random_tensor(OrderedDict([("chans", Bint[3]), ("seq", Bint[10])]))
kernel = Variable("kernel", Bint[3])
Y = Unroll(X, "seq", kernel)
Y

Tensor(tensor([[[ 1.0071,  0.9979,  0.0461],
         [ 0.9979,  0.0461, -1.5424],
         [ 0.0461, -1.5424,  1.4369],
         [-1.5424,  1.4369, -0.0624],
         [ 1.4369, -0.0624,  0.6151],
         [-0.0624,  0.6151,  0.0119],
         [ 0.6151,  0.0119, -1.0266],
         [ 0.0119, -1.0266, -1.0915]],

        [[-0.4676, -1.5476,  0.0486],
         [-1.5476,  0.0486,  0.4757],
         [ 0.0486,  0.4757,  0.6420],
         [ 0.4757,  0.6420, -0.0745],
         [ 0.6420, -0.0745, -0.1132],
         [-0.0745, -0.1132, -1.9013],
         [-0.1132, -1.9013, -1.0457],
         [-1.9013, -1.0457,  1.3052]],

        [[-0.2131,  0.0280,  0.5392],
         [ 0.0280,  0.5392,  1.0034],
         [ 0.5392,  1.0034,  0.9759],
         [ 1.0034,  0.9759,  1.4469],
         [ 0.9759,  1.4469,  0.3573],
         [ 1.4469,  0.3573, -0.9284],
         [ 0.3573, -0.9284,  0.2496],
         [-0.9284,  0.2496,  0.1293]]]), OrderedDict([('chans', Bint[3, ]), ('seq', Bint[8, ]), ('kernel', Bint[3, 

In [23]:
Y(seq=4, kernel=2), X(seq=4+2) # i + j ?

(Tensor(tensor([ 0.6151, -0.1132,  0.3573]), OrderedDict([('chans', Bint[3, ])]), 'real'),
 Tensor(tensor([ 0.6151, -0.1132,  0.3573]), OrderedDict([('chans', Bint[3, ])]), 'real'))

In [24]:
@make_funsor
def Conv1d(
    X: Has[{"chans", "seq"}],
    W: Has[{"chans", "kernel"}],
    b: Funsor,
    chans: Bound,
    kernel: Bound,
    seq: Bound
) -> Fresh[lambda X: X]:
    return (W * Unroll(X, seq.name, kernel)).reduce(ops.add, frozenset({chans, kernel})) + b

In [25]:
x = random_tensor(OrderedDict([("chans", Bint[3]), ("seq", Bint[10])]))
kernel = Variable("kernel", Bint[3])
w = random_tensor(OrderedDict([("chans", Bint[3]), ("kernel", Bint[3])]))
b = random_tensor(OrderedDict([]))

In [26]:
Conv1d(x, w, b, "chans", "kernel", "seq")

Tensor(tensor([-0.1743,  2.2894,  1.8864,  2.2908,  6.7270,  3.3832,  6.4320,  1.2325]), OrderedDict([('seq', Bint[8, ])]), 'real')

In [27]:
@make_funsor
def Conv2d(
    X: Has[{"chans", "seq"}],
    W: Has[{"chans", "kernel"}],
    b: Funsor,
    chans: Bound,
    kh: Bound,
    height: Bound,
    kw: Bound,
    width: Bound
) -> Fresh[lambda X: X]:
    return (W * Unroll(Unroll(X, width.name, kw), height.name, kh)).reduce(ops.add, frozenset({chans, kh, kw})) + b

In [28]:
x = random_tensor(OrderedDict([("chans", Bint[3]), ("height", Bint[10]), ("width", Bint[8])]))
kh = Variable("kh", Bint[3])
kw = Variable("kw", Bint[4])
w = random_tensor(OrderedDict([("chans", Bint[3]), ("kh", Bint[3]), ("kw", Bint[4])]))
b = random_tensor(OrderedDict([]))

Conv2d(x, w, b, "chans", "kh", "height", "kw", "width")

Tensor(tensor([[ -5.9767, -14.7826,  -1.3400,  -6.0648,  -4.9239],
        [ -6.6478,  -4.0896, -10.8918,   0.3870,   5.9537],
        [  7.0979,  -8.1826,   3.0593,  -0.2431,  -4.0221],
        [ -1.6603,  -2.7840,  -0.5078,  -3.2974,   4.2820],
        [ -3.8931,  -5.6690,   1.3527,  -6.9459,  -4.1702],
        [ -3.8460,  -9.1965,   2.9566,  -0.9288,  -0.2515],
        [  0.2285,   4.0946, -16.1375,   1.8454, -14.1157],
        [-13.8018,  -3.6187,   1.1990,  -7.7495,   8.8234]]), OrderedDict([('height', Bint[8, ]), ('width', Bint[5, ])]), 'real')

### Max pooling

In [2]:
@make_funsor
def Unflatten(
    x: Funsor,
    i: Bound,
    i_over_2: Fresh[lambda i: Bint[i.size // 2]],
    i_mod_2: Fresh[lambda: Bint[2]],
) -> Fresh[lambda x: x]:
    assert i.output.size % 2 == 0
    return x(**{i.name: i_over_2 * Number(2, 3) + i_mod_2})

In [3]:
# Partial evaluation
X = random_tensor(OrderedDict([("seq", Bint[10])]))
X_var = Variable("x", Bint[10])
Y = Unflatten(X_var, "seq", "seq2", "kernel")
Y(x=X)

ValueError: Cannot infer domain of i=seq

In [2]:
@make_funsor
def Pool(
    x: Funsor,
    seq: Bound,
    k: Value[int],
    kernel: Fresh[lambda k: Bint[k]],
    seq2: Fresh[lambda seq, k: Bint[seq.size // k]], # seq -> Bint[]
) -> Fresh[lambda x: x]: # x -> x.output (Bint[] or Real)
    seq2 = x.materialize(seq2)
    kernel = x.materialize(kernel)
    breakpoint()
    return x(**{seq.name: seq2 * Number(k, k+1) + kernel})

In [67]:
@make_funsor
def Pool2(
    x: Funsor,
    seq: Bound,
    kernel: Funsor,
    seq2: Fresh[lambda seq, kernel: Bint[seq.size // kernel.output.size]], # seq -> Bint[]
) -> Fresh[lambda x: x]: # x -> x.output (Bint[] or Real)
    return x(**{seq.name: seq2 * Number(kernel.output.size, kernel.output.size+1) + kernel})

Bint[7, ]

In [3]:
X = random_tensor(OrderedDict([("seq", Bint[10])]))
# kernel = Variable("kernel", Bint[2])
Y = Pool(X, "seq", 2, "kernel", "seq2")
Y

Pool(Tensor(tensor([-0.1375, -0.1264, -0.8750, -0.7947, -2.1557,  0.1106,  1.0998, -0.8064,
         1.5379, -0.9541]), OrderedDict([('seq__BOUND_1', Bint[10, ])]), 'real'), Variable('seq__BOUND_1', Bint[10, ]), 2, Variable('kernel', Bint[2, ]), Variable('seq2', Bint[5, ]))

In [55]:
Y.inputs

OrderedDict([('kernel', Bint[2, ]), ('seq2', Bint[5, ])])

In [18]:
funsor.terms.Tuple(kernel, temp_seq).output

Product[Bint[2, ], Bint[5, ]]

In [32]:
((temp_seq * Number(1, 3))).output

Bint[5, ]

In [14]:
(funkernel * temp_seq).output

Bint[5, ]

In [6]:
X(seq=temp_seq+kernel)

NameError: name 'temp_seq' is not defined

In [7]:
temp_seq = Variable("temp_seq", Bint[5])
X(seq=kernel+temp_seq)

ValueError: Output mismatch: Bint[6, ] vs Bint[10, ]

### Normalization layers

In [61]:
@make_funsor
def Mean(
    X: Funsor,
    ax: Bound
) -> Fresh[lambda X: X]:
    return X.reduce(ops.add, ax) / ax.output.size

@make_funsor
def Variance(
    X: Funsor,
    ax: Bound
) -> Fresh[lambda X: X]:
    return Mean((X - Mean(X, ax))**2, ax)

@make_funsor
def Standardize(
    X: Funsor,
    ax: Bound
) -> Fresh[lambda X: X]:
    return (X - Mean(X, ax)) / (Variance(X, ax) + ops.finfo(X.data).eps)

@make_funsor
def BatchNorm(
    X: Funsor,
    gamma: Funsor,
    beta: Funsor,
    batch: Bound,
    layer: Bound
) -> Fresh[lambda X: X]:
    return Standardize(X, )

In [62]:
x = random_tensor(OrderedDict([("i", Bint[10])]))
Standardize(x, "i")

Tensor(tensor([-2.0574, -0.4460,  1.0813,  0.4329,  0.9293, -0.0551,  0.2744, -0.5563,
        -1.1598,  1.5569]), OrderedDict([('i', Bint[10, ])]), 'real')

## Transformer

## LeNet

## Other examples

### Discrete random variables

In [29]:
B_given_A = Tensor(
    torch.tensor([[0.2, 0.3, 0.5],
                  [0.8, 0.1, 0.1]]),
    inputs=OrderedDict([("A", Bint[2]), ("B", Bint[3])]),
    dtype="real"
)

A = Tensor(
    torch.tensor([0.6, 0.4]),
    inputs=OrderedDict([("A", Bint[2])]),
    dtype="real"
)

In [30]:
A_and_B = B_given_A * A
A_and_B

Tensor(tensor([[0.1200, 0.1800, 0.3000],
        [0.3200, 0.0400, 0.0400]]), OrderedDict([('A', Bint[2, ]), ('B', Bint[3, ])]), 'real')

In [31]:
B = A_and_B.reduce(ops.add, "A")
B

Tensor(tensor([0.4400, 0.2200, 0.3400]), OrderedDict([('B', Bint[3, ])]), 'real')

In [32]:
A_given_B = A_and_B / B
A_given_B

Tensor(tensor([[0.2727, 0.8182, 0.8824],
        [0.7273, 0.1818, 0.1176]]), OrderedDict([('A', Bint[2, ]), ('B', Bint[3, ])]), 'real')

### Advanced indexing

### Continuous bag of words

### Sudoku ILP

### K-means clustering

In [None]:
x = random_tensor(OrderedDict([("batch", Bint[10]), ("d", Bint[4])]))
c = random_tensor(OrderedDict([("clusters", Bint[3]), ("d", Bint[4])]))

### Beam search

### Multivariate normal distribution

# LaTeX Macros

# Formal Definitions

## Records and shapes

## Named tensors

## Named tensor operations

## Common operations

# Differentiation

## Definition

## Rules

## Example

## Broadcasting

# Alternatives