# Tracking Sensor Bias

We want to compute the joint posterior over sensors' biases in a 2-D tracking setting.

In [71]:
from collections import OrderedDict

import torch
from torch.optim import Adam

import pyro
import pyro.distributions as dist

import funsor
import funsor.pyro
import funsor.distributions as f_dist
import funsor.ops as ops
from funsor.pyro.convert import dist_to_funsor, mvn_to_funsor, matrix_and_mvn_to_funsor, tensor_to_funsor
from funsor.interpreter import interpretation, reinterpret
from funsor.optimizer import apply_optimizer
from funsor.terms import lazy, eager_or_die
from funsor.domains import bint, reals
from funsor.torch import Tensor, Variable
from funsor.sum_product import sequential_sum_product

import matplotlib.pyplot as plt

Simulate some synthetic data:

In [69]:
num_sensors = 5
num_frames = 100

# simulate biased sensors
sensors  = []
for _ in range(num_sensors):
    bias = 0.5 * torch.randn(2)
    sensors.append(bias)

# simulate a single track
# TODO heterogeneous time
track = []
z = 10 * torch.rand(2)  # initial state
v = 2 * torch.randn(2)  # velocity
for t in range(num_frames):
    # Advance latent state.
    z += v + 0.1 * torch.randn(2)
#     z.clamp_(min=0, max=10)  # keep in the box
    
    # Observe via a random sensor.
    sensor_id = pyro.sample('id', dist.Categorical(torch.ones(num_sensors)))
    x = z - sensors[sensor_id]
    track.append({"sensor_id": sensor_id, "x": x})
    
# simulate all tracks
full_observations = []
z = 10 * torch.rand(5, 2)  # initial state
v = 2 * torch.randn(5, 2)  # velocity
for t in range(num_frames):
    # Advance latent state.
    z += v + 0.1 * torch.randn(5, 2)
#     z.clamp_(min=0, max=10)  # keep in the box
    
    # Observe via a random sensor.
    x = z - torch.stack(sensors)
    full_observations.append(x)
full_observations = torch.stack(full_observations)
assert full_observations.shape == (num_frames, 5, 2)
full_observations = Tensor(full_observations, OrderedDict([("time", bint(num_frames))]))

Now let's set up a tracking problem in Funsor. We start by modeling the biases of each sensor.

In [70]:
# TODO transform this to cholesky decomposition
# print(bias_cov.shape)
# bias_cov = bias_cov @ bias_cov.t()
# create a joint Gaussian over biases

# covs = [torch.eye(2, requires_grad=True) for i in range(num_sensors)]
# bias_dist = 0.
# for i in range(num_sensors):
#     bias += funsor.pyro.convert.mvn_to_funsor(
#         dist.MultivariateNormal(torch.zeros(2), covs[i]),
# #         event_dims=("pos",),
# #         real_inputs=OrderedDict([("bias_{}".format(i), reals(2))])
#         real_inputs=OrderedDict([("bias", reals(2))])
#     )(value="bias_{}".format(i))
# bias_dist.__dict__

# we can't write bias_dist as a sum of mvns because affine transformation
# of mvns is not supported yet.  instead we will combine all the sensors
# into a giant tensor
bias_scales = torch.ones(2, requires_grad=True)  # This can be learned
bias_dist = funsor.pyro.convert.mvn_to_funsor(
    dist.MultivariateNormal(
        torch.zeros(num_sensors * 2),
        bias_scales.expand(num_sensors, 2).reshape(-1).diag_embed()
    ),
    real_inputs=OrderedDict([("bias", reals(num_sensors, 2))])
)
bias_dist.__dict__

{'inputs': OrderedDict([('bias', reals(5, 2))]),
 'output': reals(),
 'fresh': frozenset(),
 'bound': frozenset(),
 'deltas': (),
 'discrete': Tensor(-9.189385414123535, OrderedDict(), 'real'),
 'gaussian': Gaussian(..., ((bias, reals(5, 2)),)),
 '_ast_values': ((),
  Tensor(-9.189385414123535, OrderedDict(), 'real'),
  Gaussian(..., ((bias, reals(5, 2)),)))}

Set up the filter in funsor.

In [57]:
%pdb off

Automatic pdb calling has been turned OFF


In [58]:
from pdb import set_trace as bb

In [73]:
# TODO
# this can be parameterized by a lower dimensional vector 
# to learn a structured transition matrix
# eg a GP with a matern v=3/2 kernel
# see paper for details 

# transition matrix from discretization as in 
# http://webee.technion.ac.il/people/shimkin/Estimation09/ch8_target.pdf
T = 1.  # timestep
trans_matrix_noise = torch.randn(1, requires_grad=True)
trans_dist_cov = torch.tensor([[1./3 * T ** 3, 0.5 * T ** 2],
                                  [0.5 * T ** 2, T]]) * trans_matrix_noise ** 2
transition_matrix = torch.randn(2, 2, requires_grad=True)
transition_matrix = torch.tensor([[1., T],
                                  [0, 1]])

def model(track):
    init_dist = torch.distributions.MultivariateNormal(torch.zeros(2), torch.eye(2))

    transition_dist = torch.distributions.MultivariateNormal(
        torch.zeros(2), trans_dist_cov)
    observation_matrix = torch.eye(2) + 0.2 * torch.randn(2, 2)
    sensor_ids =torch.tensor([frame["sensor_id"] for frame in track])
    biases = torch.zeros(num_sensors, 2, requires_grad=True)
    bias = biases[sensor_ids]
    observation_dist = torch.distributions.MultivariateNormal(
        bias,
        torch.eye(2))

    init = dist_to_funsor(init_dist)(value="state")
    # inputs are the previous state ``state`` and the next state
    trans = matrix_and_mvn_to_funsor(transition_matrix, transition_dist,
                                     ("time",), "state", "state(time=1)")
    obs = matrix_and_mvn_to_funsor(observation_matrix, observation_dist,
                                   ("time",), "state(time=1)", "value")
    
    # Now this is the crux, we add bias to the observation as a global variable
    # single interleaved track
    sensor_ids = Tensor(
        torch.tensor([frame["sensor_id"] for frame in track]),
        OrderedDict([("time", bint(num_frames))]),
        dtype=len(sensors)
    )
#     biased_observations = Tensor(
#         torch.stack([frame["x"] for frame in track]),
#         OrderedDict([("time", bint(num_frames))])
#     )
    
    # incorporate sensor id in the observation by creating
    # a free variable that has the signature
    # inputs: bias of shape (num_sensors, 2), sensor_ids
    # outputs shape 2
    bias = Variable("bias", reals(num_sensors, 2))#  [sensor_ids]
    debiased_observations = track - bias
#     debiased_observations = biased_observations
    # this indexing pattern is not implemented to sub into a Gaussian
    # https://github.com/pyro-ppl/funsor/pull/220
    # instead, we can use matrix_and_mvn_to_funsor and index  the proper latents and just 
    # observe naively
    
#     bias = Variable("bias", reals(num_sensors, 2))
#     debiased_observations = all_tracks - bias
    obs = obs(value=debiased_observations)
#     print(obs)
    
    logp = trans + obs + bias_dist

    bb()
    # collapse out the time variable
    # TODO this can only handle homogeneous funsor types
    logp = sequential_sum_product(ops.logaddexp, ops.add,
                                  logp, "time", {"state": "state(time=1)"})
    logp += init
    # logaddexp across all states
    logp = logp.reduce(ops.logaddexp, frozenset(["state", "state(time=1)"]))
#     # ensure we collapsed out the right dim
#     assert logp.data.dim() == 0
    return logp

In [64]:
all_tracks.data.shape

torch.Size([5, 100, 2])

## Inference

Finally we have a result that is a joint Gaussian over the biases.
We can
1. optimize all parameters to maximize `result`
2. estimate the joint distribution over all bias parameters.

In [66]:
num_epochs = 200
params = [bias_scales]
# params.append(transition_matrix)
optim = Adam(params, lr=1e-3)
for i in range(num_epochs):
    optim.zero_grad()
    with interpretation(lazy):
        log_prob = apply_optimizer(model(track))
    loss = -reinterpret(log_prob).data
    loss.backward()
    if i % 10 == 0:
        print(loss)
    optim.step()
print(params)

> <ipython-input-63-7e95554d81ab>(72)model()
-> logp = sequential_sum_product(ops.logaddexp, ops.add,
(Pdb) l
 67  	    logp = trans + obs + bias_dist
 68  	
 69  	    bb()
 70  	    # collapse out the time variable
 71  	    # TODO this can only handle homogeneous funsor types
 72  ->	    logp = sequential_sum_product(ops.logaddexp, ops.add,
 73  	                                  logp, "time", {"state": "state(time=1)"})
 74  	    logp += init
 75  	    # logaddexp across all states
 76  	    logp = logp.reduce(ops.logaddexp, frozenset(["state", "state(time=1)"]))
 77  	#     # ensure we collapsed out the right dim
(Pdb) c


ValueError: Output mismatch: bint(13) vs bint(12)

Visualize the joint posterior distribution.

### possible plots
1. plot the MSE of the MAP estimates with and without bias (or table)
2. train with and without marginalizing out bias, plot both loss curves
  - plot nll and MSE at each epoch
3. smoothing? would require adjoint algorithm `tests/test_adjoint.py`
4. 