# Introduction

In [1]:
from collections import OrderedDict
import functools

import math
import torch
from torch.distributions import constraints

import funsor
from funsor.terms import Funsor, Variable
from funsor.tensor import Tensor
from funsor.domains import Array, Bint, Real, Reals
from funsor.factory import Bound, Fresh, Value, make_funsor, to_funsor
import funsor.ops as ops
from funsor.cnf import Contraction
from funsor.testing import random_tensor

from pyro import set_rng_seed as pyro_set_rng_seed
from pyro.ops.indexing import Vindex
from pyro.poutine.messenger import Messenger

# Select PyTorch as the numeric backend for funsor.
funsor.set_backend("torch")
# Use float32 as torch's default floating-point dtype for the tensors created below.
torch.set_default_dtype(torch.float32)
# Fix the RNG seed so the random_tensor(...) cells are repeatable
# (presumably seeds torch through Pyro's helper -- TODO confirm which generators it covers).
pyro_set_rng_seed(101)

# Informal Overview

## Named Tensors

In [4]:
# A 3x3 real-valued matrix whose dimensions are addressed by name instead of
# by position: "height" indexes the rows and "width" indexes the columns.
A = Tensor(
    torch.tensor([
        [3.0, 1.0, 4.0],
        [1.0, 5.0, 9.0],
        [2.0, 6.0, 5.0],
    ]),
    inputs=OrderedDict(height=Bint[3], width=Bint[3]),
    dtype="real",
)

In [5]:
# Indexing by name: fixing height=0 leaves a funsor whose only input is `width`.
A(height=0)

Tensor(tensor([3., 1., 4.]), OrderedDict([('width', Bint[3, ])]), 'real')

In [6]:
# Indexing by name: fixing width=2 leaves a funsor whose only input is `height`.
A(width=2)

Tensor(tensor([4., 9., 5.]), OrderedDict([('height', Bint[3, ])]), 'real')

## Named tensor operations

### Elementwise operations and broadcasting

In [5]:
# Logistic sigmoid written out by hand; it is applied elementwise, so the result
# keeps both named inputs of A.
1 / (1 + (-A).exp())

Tensor(tensor([[0.9526, 0.7311, 0.9820],
        [0.7311, 0.9933, 0.9999],
        [0.8808, 0.9975, 0.9933]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [95]:
# Built-in elementwise sigmoid; same values as the hand-written 1 / (1 + exp(-A)).
A.sigmoid()

Tensor(tensor([[0.9526, 0.7311, 0.9820],
        [0.7311, 0.9933, 0.9999],
        [0.8808, 0.9975, 0.9933]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [7]:
# Two named vectors used in the broadcasting and reduction examples:
# `x` varies along "height" and `y` varies along "width".
#
# The data is written with float literals (as for `A`) so that the stored
# torch dtype agrees with the declared funsor dtype "real"; integer literals
# would produce an int64 tensor that is merely labelled "real", which breaks
# float-only operations and makes the printed results look integer-valued.
x = Tensor(
    torch.tensor([2., 7., 1.]),
    inputs=OrderedDict([("height", Bint[3])]),
    dtype="real"
)
y = Tensor(
    torch.tensor([1., 4., 1.]),
    inputs=OrderedDict([("width", Bint[3])]),
    dtype="real"
)

In [8]:
# x only has a `height` input, so it is matched to A by name and broadcast
# across `width`.
A + x

Tensor(tensor([[ 5,  3,  6],
        [ 8, 12, 16],
        [ 3,  7,  6]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [9]:
# y only has a `width` input, so it is matched to A by name and broadcast
# across `height`.
A + y

Tensor(tensor([[ 4,  5,  5],
        [ 2,  9, 10],
        [ 3, 10,  6]]), OrderedDict([('height', Bint[3, ]), ('width', Bint[3, ])]), 'real')

### Reductions

In [13]:
# NOTE(review): in the recorded run this call raised NotImplementedError, so the
# cell does not survive "Restart & Run All" as it stands.
A.mean()

NotImplementedError: 

In [14]:
# The funsor-level dtype label given to the Tensor constructor ("real").
A.dtype

'real'

In [12]:
# Variance over `height`, built directly as a Reduce term. The recorded values
# (0.6667, 4.6667, 4.6667) correspond to dividing by N rather than N - 1.
# NOTE(review): the recorded output includes an ipdb session triggered by a
# breakpoint() inside a local funsor checkout; re-run against a clean install.
funsor.terms.Reduce(ops.var, A, frozenset({Variable("height", Bint[3])}))

> /home/ordabayev/repos/funsor/funsor/tensor.py(309)eager_reduce()
    307     def eager_reduce(self, op, reduced_vars):
    308         breakpoint()
--> 309         if op in REDUCE_OP_TO_NUMERIC:
    310             numeric_op = REDUCE_OP_TO_NUMERIC[op]
    311             assert isinstance(reduced_vars, frozenset)

ipdb> c


Tensor(tensor([0.6667, 4.6667, 4.6667]), OrderedDict([('width', Bint[3, ])]), 'real')

In [6]:
# NOTE(review): in the recorded run this call raised AssertionError, i.e.
# .reduce() did not accept ops.mean here even though the Reduce(ops.var, ...)
# term in the previous cell evaluated.
A.reduce(ops.mean, reduced_vars="height")

AssertionError: 

In [13]:
# Sum out `width`; one total per `height` remains.
A.reduce(ops.add, reduced_vars="width")

Tensor(tensor([ 8, 15, 13]), OrderedDict([('height', Bint[3, ])]), 'real')

In [14]:
# Summing out both names leaves a funsor with no inputs (a single number).
A.reduce(ops.add, reduced_vars={"height", "width"})

Tensor(36, OrderedDict(), 'real')

In [15]:
# Multiply by name, then sum out the shared `width`: a matrix-vector product
# whose result is indexed by `height`.
(A * y).reduce(ops.add, reduced_vars="width")

Tensor(tensor([11, 30, 31]), OrderedDict([('height', Bint[3, ])]), 'real')

In [29]:
# The same matrix-vector product expressed as one fused term: the terms are
# combined with bin_op (multiplication) and `width` is eliminated with red_op
# (addition). The recorded result equals (A * y).reduce(ops.add, "width").
Contraction(
    red_op=ops.add,
    bin_op=ops.mul,
    reduced_vars=frozenset({Variable("width", output=Bint[3])}),
    terms=(A, y)
)

Tensor(tensor([11, 30, 31]), OrderedDict([('height', Bint[3, ])]), 'real')

In [30]:
# NOTE(review): a cell displays only its last expression, but the recorded
# output (54) is the value of the inner product on the first line; split these
# into separate cells or re-run to refresh the output.
(x * x).reduce(ops.add, reduced_vars="height")  # inner product
x * y  # outer product
(A * y).reduce(ops.add, reduced_vars="width")  # matrix-vector product
# the vector-matrix product is written the same way; only the reduced name changes
(A * x).reduce(ops.add, reduced_vars="height")  # vector-matrix product

Tensor(54, OrderedDict(), 'real')

In [45]:
# Second matrix for the matrix-matrix product: it shares the name "width" with
# A and introduces a new name "width'" for its other dimension.
#
# Float literals keep the stored torch dtype consistent with the declared
# funsor dtype "real" (integer literals would create an int64 tensor that is
# only labelled "real").
B = Tensor(
    torch.tensor([[3., 2., 5.], [5., 4., 0.], [8., 3., 6.]]),
    inputs=OrderedDict([("width", Bint[3]), ("width'", Bint[3])]),
    dtype="real"
)

In [46]:
# The shared name `width` is summed out; `height` (from A) and `width'` (from B)
# remain as the inputs of the product.
(A * B).reduce(ops.add, reduced_vars="width")  # matrix-matrix product

Tensor(tensor([[ 46,  22,  39],
        [100,  49,  59],
        [ 76,  43,  40]]), OrderedDict([('height', Bint[3, ]), ("width'", Bint[3, ])]), 'real')

### Renaming and reshaping

In [44]:
# Substituting a string for a name renames that input: `height` becomes `height'`.
# The dict form is needed whenever the old name is not a valid Python identifier.
A(**{"height": "height'"})

Tensor(tensor([[3, 1, 4],
        [1, 5, 9],
        [2, 6, 5]]), OrderedDict([("height'", Bint[3, ]), ('width', Bint[3, ])]), 'real')

In [43]:
# Same rename as above, using keyword syntax for the old name.
A(height="height'")

Tensor(tensor([[3, 1, 4],
        [1, 5, 9],
        [2, 6, 5]]), OrderedDict([("height'", Bint[3, ]), ('width', Bint[3, ])]), 'real')

# Examples

## Building blocks

### Feedforward neural networks

In [59]:
# Widths of the input and of the three hidden layers built below.
input_dim = 100
hidden_1_dim = 32
hidden_2_dim = 16
hidden_3_dim = 8

# Input activations, indexed by the name "input_layer".
X0 = random_tensor(OrderedDict(input_layer=Bint[input_dim]))

# Layer 1: contract the weights with the input over "input_layer", add the
# bias and squash with a sigmoid; the result is indexed by "hidden_layer_1".
W1 = random_tensor(
    OrderedDict(input_layer=Bint[input_dim], hidden_layer_1=Bint[hidden_1_dim])
)
b1 = random_tensor(OrderedDict(hidden_layer_1=Bint[hidden_1_dim]))
X1 = ((W1 * X0).reduce(ops.add, "input_layer") + b1).sigmoid()

# Layer 2: same pattern, consuming "hidden_layer_1" and producing "hidden_layer_2".
W2 = random_tensor(
    OrderedDict(hidden_layer_1=Bint[hidden_1_dim], hidden_layer_2=Bint[hidden_2_dim])
)
b2 = random_tensor(OrderedDict(hidden_layer_2=Bint[hidden_2_dim]))
X2 = ((W2 * X1).reduce(ops.add, "hidden_layer_1") + b2).sigmoid()

# Layer 3: same pattern, consuming "hidden_layer_2" and producing "hidden_layer_3".
W3 = random_tensor(
    OrderedDict(hidden_layer_2=Bint[hidden_2_dim], hidden_layer_3=Bint[hidden_3_dim])
)
b3 = random_tensor(OrderedDict(hidden_layer_3=Bint[hidden_3_dim]))
X3 = ((W3 * X2).reduce(ops.add, "hidden_layer_2") + b3).sigmoid()

In [96]:
@make_funsor
def FullConn(
    x: Funsor,
    W: Funsor,
    b: Funsor,
    out_layer: Bound
) -> Fresh[lambda x: x]:
    """Fully connected layer with a sigmoid activation.

    Computes ``sigmoid(sum_layer(W * x) + b)`` and renames the output
    dimension back to ``"layer"`` so that layers can be chained.

    Args:
        x: Input activations with an input named ``"layer"``.
        W: Weights with inputs ``"layer"`` and the output dimension.
        b: Bias indexed by the output dimension.
        out_layer: The output dimension of ``W`` and ``b``; it is bound by
            this funsor and does not appear in the result.

    Returns:
        Activations whose input is named ``"layer"`` (sized like ``out_layer``),
        with the same output domain as ``x``.
    """
    result = ((W * x).reduce(ops.add, "layer") + b).sigmoid()
    # Rename via the bound variable's actual name rather than the literal
    # "out_layer", so the layer also works when the caller picks another name
    # for the output dimension.
    return result(**{out_layer.name: "layer"})

In [98]:
# Rebuild the first layer with the generic names FullConn expects: the
# contracted dimension is called "layer" and the produced one "out_layer".
input_dim = 100
hidden_1_dim = 32

X0 = random_tensor(OrderedDict(layer=Bint[input_dim]))
W1 = random_tensor(
    OrderedDict(layer=Bint[input_dim], out_layer=Bint[hidden_1_dim])
)
b1 = random_tensor(OrderedDict(out_layer=Bint[hidden_1_dim]))

# The result is indexed by "layer" again, now with hidden_1_dim entries.
X1 = FullConn(X0, W1, b1, "out_layer")
X1

Tensor(tensor([1.0000e+00, 3.6789e-08, 2.7427e-03, 1.0000e+00, 5.0574e-02, 9.8840e-01,
        8.8957e-01, 5.6603e-02, 1.9728e-02, 1.0000e+00, 5.6834e-01, 2.2332e-04,
        9.9997e-01, 1.0000e+00, 7.0630e-01, 1.1290e-05, 9.9352e-01, 9.6474e-01,
        1.1634e-01, 1.0000e+00, 1.0000e+00, 1.0000e+00, 9.8779e-01, 1.0000e+00,
        9.8806e-01, 4.6620e-04, 5.1553e-01, 1.2920e-01, 4.2344e-03, 2.3528e-07,
        4.6444e-02, 5.6177e-01]), OrderedDict([('layer', Bint[32, ])]), 'real')

In [87]:
# NOTE(review): `FullConn1` is not defined by any cell visible in this notebook,
# so this cell relies on leftover kernel state and raises NameError on a fresh
# kernel. The recorded output looks like a lazy (unevaluated) layer expression.
FullConn1

Unary(ops.sigmoid, Tensor(tensor([ 12.1907,  -9.5505,   0.7773, -10.6436,   6.1068,  -7.9064,  13.7416,
          2.8200, -10.2432,  -2.4031,   4.2102,  -7.4767,  -6.4883,   4.5402,
         -5.0474,  -5.3027, -19.0056,   5.0540, -10.7980, -10.9381,  -5.0640,
         -8.7423,  -5.3614,  16.5684,  11.0685, -10.4617,  14.0840,  -5.7486,
          5.6930,  22.1246,  -9.4797,   9.4110]), OrderedDict([('layer', Bint[32, ])]), 'real') * Variable('x', Bint[32, ]).reduce(nullop, set()) + Tensor(tensor([-0.6199, -0.5905,  0.9894,  0.2203,  1.1884, -0.1901,  1.7529, -0.8433,
         2.0776, -0.2285, -0.0892, -0.1676,  0.7253, -0.8604,  0.0875,  0.3758,
        -1.0097,  0.9317, -0.3663,  0.1698, -1.2254, -0.7038, -1.4458,  0.6572,
        -2.2138,  0.0464, -1.2918, -0.3530, -1.7813, -0.4770, -0.5504, -0.6619]), OrderedDict([('layer', Bint[32, ])]), 'real').reduce(nullop, set()))

### Recurrent neural networks

In [None]:
@make_funsor
def RecurrentLayer(
    x: Funsor,
    h: Funsor,
    Wh: Funsor,
    Wi: Funsor,
    b: Funsor,
    hidden: Bound,
    input: Bound
) -> Fresh[lambda x: x]:
    """One step of a simple recurrent layer with a sigmoid activation.

    Computes ``sigmoid(sum_hidden(Wh * h) + sum_input(Wi * x) + b)``.

    Args:
        x: Current input, indexed by the ``input`` dimension.
        h: Previous hidden state, indexed by the ``hidden`` dimension. The
            body always referred to ``h``, but it was missing from the
            signature, so every call failed with NameError.
        Wh: Hidden-to-hidden weights (contracted with ``h`` over ``hidden``).
        Wi: Input-to-hidden weights (contracted with ``x`` over ``input``).
        b: Bias added to the pre-activation.
        hidden: Dimension summed out of ``Wh * h``; bound by this funsor.
        input: Dimension summed out of ``Wi * x``; bound by this funsor.

    Returns:
        The new hidden state, with the same output domain as ``x``.
    """
    # Reduce over the bound variables themselves instead of the literal names
    # "hidden" / "input", so the layer works whatever names the caller uses.
    recurrent = (Wh * h).reduce(ops.add, hidden)
    feedforward = (Wi * x).reduce(ops.add, input)
    output = (recurrent + feedforward + b).sigmoid()
    # NOTE(review): `hidden` has just been summed out of the recurrent term, so
    # this substitution only has an effect if another operand still carries
    # that input. The intent may have been the opposite rename ("new_hidden"
    # back to `hidden`) -- confirm before relying on the output's input names.
    return output(**{hidden.name: "new_hidden"})

### Attention

In [128]:
@make_funsor
def Softmax(
    x: Funsor,
    i: Funsor
) -> Fresh[lambda x: x]:
    """Softmax of ``x`` along the named dimension ``i``.

    Args:
        x: Unnormalized scores.
        i: The dimension to normalize over (a Variable, as passed by the
            callers in this notebook).

    Returns:
        A funsor with the same inputs and output domain as ``x`` whose values
        are positive and sum to one along ``i``.
    """
    # exp(x - logsumexp_i(x)) equals exp(x) / sum_i exp(x) mathematically, but
    # does not overflow for large scores and evaluates exp only once.
    return (x - x.reduce(ops.logaddexp, i)).exp()

In [131]:
@make_funsor
def Attention(
    Q: Funsor,
    K: Funsor,
    V: Funsor,
    M: Funsor,
    key: Bound,
    seq: Bound
) -> Fresh[lambda Q: Q]:
    """Scaled dot-product attention with an additive mask.

    Args:
        Q: Query, indexed by ``key``.
        K: Keys, indexed by ``key`` and ``seq``.
        V: Values, indexed by ``seq`` (plus any value dimensions).
        M: Mask added to the scaled scores before the softmax.
        key: Dimension contracted between ``Q`` and ``K``; bound here.
        seq: Sequence dimension normalized by the softmax and then summed
            out against ``V``; bound here.

    Returns:
        The attention-weighted combination of ``V`` over ``seq``.
    """
    # Dot products over `key`, scaled by the square root of its size.
    scale = math.sqrt(key.output.size)
    scores = (Q * K).reduce(ops.add, key) / scale + M
    # Normalize over the sequence, then take the weighted sum of the values.
    weights = Softmax(scores, seq)
    return (weights * V).reduce(ops.add, seq)

In [132]:
# One query over a 10-dimensional key space, attending to a sequence of
# length 3 whose values have 5 components; `m` is the additive mask.
q = random_tensor(OrderedDict(key=Bint[10]))
k = random_tensor(OrderedDict(key=Bint[10], seq=Bint[3]))
v = random_tensor(OrderedDict(seq=Bint[3], val=Bint[5]))
m = random_tensor(OrderedDict(seq=Bint[3]))

# Both `key` and `seq` are consumed, so only `val` remains in the result.
Attention(q, k, v, m, "key", "seq")

Tensor(tensor([ 0.9867,  1.1100,  0.1977, -0.2142,  0.3773]), OrderedDict([('val', Bint[5, ])]), 'real')

In [134]:
# Softmax on its own: normalize q along `key`; the result keeps `key` as its input.
Softmax(q, Variable("key", Bint[10]))

Tensor(tensor([0.1604, 0.2158, 0.1558, 0.0590, 0.0744, 0.0630, 0.0846, 0.0741, 0.1013,
        0.0115]), OrderedDict([('key', Bint[10, ])]), 'real')

In [135]:
# Reference computation written out by hand; the recorded values match the
# Softmax(...) cell above.
q.exp() / q.exp().reduce(ops.add, "key")

Tensor(tensor([0.1604, 0.2158, 0.1558, 0.0590, 0.0744, 0.0630, 0.0846, 0.0741, 0.1013,
        0.0115]), OrderedDict([('key', Bint[10, ])]), 'real')

### Convolution

In [152]:
@make_funsor
def Unroll(
    x: Funsor,
    seq: Bound,
    new_seq: Fresh[lambda seq: Bint[seq.size - 1]],
    kernel: Fresh[lambda: Bint[2]],
) -> Fresh[lambda x: x]:
    """Unroll ``x`` along ``seq`` into sliding windows of width 2.

    Args:
        x: Funsor with an input named by ``seq``.
        seq: The dimension to unroll; bound here, so it is absent from the result.
        new_seq: Fresh window-start dimension with ``seq.size - 1`` positions.
        kernel: Fresh within-window offset dimension of size 2.

    Returns:
        A funsor over ``new_seq`` and ``kernel`` (in place of ``seq``) whose
        entry at ``(new_seq, kernel)`` is ``x`` at ``seq = new_seq + kernel``.
    """
    # Both indices are zero-based, so new_seq + kernel already spans
    # 0 .. seq.size - 1. The former "- 1" produced index -1 at
    # (new_seq=0, kernel=0), which is outside the domain of `seq`.
    return x(**{seq.name: new_seq + kernel})

In [156]:
# Manual unrolling of a length-10 sequence into 9 windows of width 2, done by
# substituting the index expression new_seq + kernel for `seq`.
# NOTE: this rebinds `q`, which the attention cells above used for the query.
q = random_tensor(OrderedDict([("seq", Bint[10])]))
# Unroll(q, "seq", "new_seq", "kernel")
new_seq = Variable("new_seq", Bint[9])
kernel = Variable("kernel", Bint[2])
# Row i of the result holds (q[i], q[i + 1]), as the recorded output shows.
q(seq=new_seq+kernel)

Tensor(tensor([[-2.4158, -0.3009],
        [-0.3009, -0.0916],
        [-0.0916,  0.3367],
        [ 0.3367, -0.2952],
        [-0.2952, -0.2060],
        [-0.2060, -0.8539],
        [-0.8539,  1.2039],
        [ 1.2039, -1.1123],
        [-1.1123, -1.9163]]), OrderedDict([('new_seq', Bint[9, ]), ('kernel', Bint[2, ])]), 'real')

In [157]:
# Adding two Variables builds a symbolic index expression; the recorded output
# shows it left unevaluated until it is substituted into a tensor.
new_seq + kernel

Variable('new_seq', Bint[9, ]) + Variable('kernel', Bint[2, ]).reduce(nullop, set())

In [None]:
# With v = q(seq=new_seq + kernel): v(new_seq=2, kernel=1) == q(seq=2 + 1)

### Max pooling

### Normalization layers

In [None]:
def standardize(
    X,
    ax,
    eps=0.0
):
    """Standardize ``X`` along the named dimension ``ax``.

    Subtracts the mean and divides by the standard deviation (computed with
    the divide-by-N convention) along ``ax``.

    The statistics are built from ``ops.add`` reductions because the cells
    above show ``reduce(ops.mean, ...)`` raising AssertionError.

    Args:
        X: Funsor with an input named by ``ax``.
        ax: Name of the dimension to standardize over, or a Variable carrying
            that name.
        eps: Optional non-negative constant added to the variance before the
            square root; the default 0.0 gives plain standardization (which
            divides by zero if ``X`` is constant along ``ax``).

    Returns:
        A funsor with the same inputs as ``X``, with zero mean and (for
        ``eps == 0``) unit variance along ``ax``.
    """
    # Accept either a plain name or a Variable for the dimension.
    name = getattr(ax, "name", ax)
    size = X.inputs[name].size
    mean = X.reduce(ops.add, ax) / size
    centered = X - mean
    variance = (centered * centered).reduce(ops.add, ax) / size
    return centered / (variance + eps).sqrt()

## Transformer

## LeNet

## Other examples

### Discrete random variables

### Advanced indexing

### Continuous bag of words

### Sudoku ILP

### K-means clustering

### Beam search

### Multivariate normal distribution

# LaTeX Macros

# Formal Definitions

## Records and shapes

## Named tensors

## Named tensor operations

## Common operations

# Differentiation

## Definition

## Rules

## Example

## Broadcasting

# Alternatives