In [None]:
from functools import partial
import torch
from torch import tensor
from torch.autograd import Variable, gradcheck

import pyro
from pyro import condition, do, infer, sample
from pyro.distributions import Categorical, Delta, Normal
from pyro.infer import EmpiricalMarginal, Importance

from matplotlib import pyplot as plt
%matplotlib inline

# Preamble
Tools for increasing readability and saving space.

In [None]:
# Inference in this notebook uses importance sampling with 5000 samples (see `infer_dist`)

def sample_f(name, dist):
    """Draw from `dist` at Pyro site `name` and return the draw as a float tensor.

    The draw is passed through `float(...)` and re-wrapped with `tensor(...)`,
    so an integer-valued sample comes back as a floating-point tensor.
    """
    draw = sample(name, dist)
    return tensor(float(draw))

def hist(marginal, name, rng=(-20, 60), num_samples=5000):
    """Plot a histogram of values drawn from `marginal`.

    `marginal`: a zero-argument callable; each call yields one draw.
    `name`: label of the variable, used in the plot title.
    `rng`: (lower, upper) range of the histogram, forwarded to `plt.hist`.
    `num_samples`: number of draws to plot (defaults to 5000).
    """
    draws = [marginal() for _ in range(num_samples)]
    plt.hist(draws, range=rng)
    plt.title("Marginal Histogram of {}".format(name))
    plt.xlabel("weight")
    plt.ylabel("#")
    
def infer_dist(prog, n_dist, num_samples=5000):
    """Obtain the unique distribution entailed by a SCM.
    Simple importance sampling is used.
    `prog`: the subroutine encoding the SCM.
    `n_dist`: a dictionary containing distributions for each
    noise object; it is handed to `run`, which forwards it to `prog`.
    `num_samples`: number of importance samples (defaults to 5000).
    """
    importance = Importance(prog, num_samples=num_samples)
    return importance.run(n_dist)

# Structural Causal Models (SCMs)

Consider a simple model with a typically unobserved latent class $Z$, and two observed features, with the following causal DAG representation:

![z_x1_x2_dag](figs/z_x1_x2.png)

Define a **SCM** on $Z$, $X_1$, and $X_2$ called $\mathfrak{C}$.  $\mathfrak{C}$ consists of 3 **structural assignments** -- **subroutines** that map the values of **noise variables** ($N_Z$, $N_{X_1}$, and $N_{X_2}$) and **direct causes** to those of **direct effects**.  

We assume the following 3 structural assignments (using formal notation):

$$
\mathfrak{C}: \left\{\begin{matrix}
Z:= &N_{Z} \\ 
X_1:= & Z^2 + N_{X_1}\\ 
X_2:= &2*X_1 + Z + N_{X_2}
\end{matrix}\right.
$$

### $\mathfrak{C}$ as a subroutine

In [None]:
def fz(Nz):
    """Structural assignment Z := N_Z, registered as Pyro site 'Z'.

    The value is sampled from a `Delta` at `Nz`, so the site is a
    deterministic function of its noise term.
    """
    point_mass = Delta(Nz)
    return sample('Z', point_mass)

def fx1(Z, Nx1):
    """Structural assignment X_1 := Z^2 + N_{X_1}, registered as Pyro site 'X1'."""
    value = Z * Z + Nx1
    return sample('X1', Delta(value))

def fx2(X1, Z, Nx2):
    """Structural assignment X_2 := 2*X_1 + Z + N_{X_2}, registered as Pyro site 'X2'."""
    value = 2 * X1 + Z + Nx2
    return sample('X2', Delta(value))

def scm_C(noise_dists):
    """Run the SCM: draw the three noise terms, then apply the structural assignments.

    `noise_dists`: a mapping with keys 'Nz', 'Nx1' and 'Nx2', each giving the
    distribution of the corresponding noise variable.
    Returns the tuple (Z, X1, X2).
    """
    # Exogenous noise, drawn in the fixed order Nz, Nx1, Nx2.
    n_z, n_x1, n_x2 = (
        sample_f(site, noise_dists[site]) for site in ('Nz', 'Nx1', 'Nx2')
    )
    # Endogenous variables, each computed from its direct causes and its noise.
    z = fz(n_z)
    x1 = fx1(z, n_x1)
    x2 = fx2(x1, z, n_x2)
    return z, x1, x2

In [None]:
def scm(noise_dists):
    """Single-function encoding of the SCM over Z, X1 and X2.

    `noise_dists`: a mapping with keys 'Nz', 'Nx1' and 'Nx2', each giving the
    distribution of the corresponding noise variable.
    Returns the tuple (Z, X1, X2).
    """
    def draw_noise(site):
        # Sample the noise term and convert the draw to a float tensor.
        return tensor(float(sample(site, noise_dists[site])))

    n_z = draw_noise('Nz')
    n_x1 = draw_noise('Nx1')
    n_x2 = draw_noise('Nx2')

    # Each endogenous variable is a Delta at a deterministic function of its inputs.
    z = sample('Z', Delta(n_z))
    x1 = sample('X1', Delta(z * z + n_x1))
    x2 = sample('X2', Delta(2 * x1 + z + n_x2))
    return z, x1, x2

In [None]:
# Ten equal probabilities of 0.1, shared by the prior of every noise term.
# Built directly with `tensor(..., requires_grad=True)`: the `Variable`
# wrapper is deprecated and no longer needed for a tensor to track gradients.
p = tensor([0.1] * 10, requires_grad=True)
n_prior = {n: Categorical(p) for n in ['Nz', 'Nx1', 'Nx2']}

Query: What is the marginal distribution of $X_2$?

In [None]:
# Approximate the distribution entailed by the SCM under the noise prior.
scm_dist = infer_dist(scm, n_prior)
# Marginal of X2, the quantity the query asks about.
x2_marginal = EmpiricalMarginal(scm_dist, sites='X2')
# Backward-compatible alias: this marginal was originally bound to the name
# `z_marginal`, although it is the marginal of X2 and not of Z.
z_marginal = x2_marginal

In [None]:
# Histogram of draws from the marginal of X2 (bound to the name `z_marginal`).
hist(z_marginal, name="X2")

Counterfactual query: Observe $X_1$ is 2, $X_2$ is 4.  What would $X_2$ have been if $X_1$ had instead been 8?


3. Pass in updated noise marginals to modified program. (This step is carried out last, in the same cell as step 2 below.)

1. Condition on evidence $X_1$ is 2, $X_2$ is 4, and infer the posterior marginals of the noise terms.

In [None]:
# Observed values of the endogenous variables X1 and X2.
evidence = dict(X1=tensor(2.), X2=tensor(4.))
# Condition the SCM on the evidence and infer under the noise prior.
scm_obs = condition(scm, data=evidence)
scm_obs_dist = infer_dist(scm_obs, n_prior)
# One empirical marginal per noise site, keyed by site name.
noise_marginals = dict(
    (site, EmpiricalMarginal(scm_obs_dist, sites=site))
    for site in ('Nz', 'Nx1', 'Nx2')
)

2. Modify the initial SCM program with the do-operation, setting $X_1$ to 8

In [None]:
# Intervention setting X1 to 8.
action = dict(X1=tensor(8.))
scm_do = do(scm, data=action)
# Run the intervened program with the noise marginals inferred from the evidence.
scm_do_dist = infer_dist(scm_do, noise_marginals)
x2_cf_marginal = EmpiricalMarginal(scm_do_dist, sites='X2')


In [None]:
# Show three individual draws from the counterfactual marginal of X2, then its histogram.
print(list(x2_cf_marginal() for _ in range(3)))
hist(x2_cf_marginal, name="X2")

# Formalization

A **structural causal model**  $\mathfrak{C}:=(S, P_N)$ consists of a collection $S$ of $d$ **structural assignments**

* $X_j := f_j(PA_j, N_j)$, $j = 1, ..., d$, where $PA_j \subseteq \{X_1, ..., X_d\} \setminus \{X_j\}$ are the **direct causes** or 'parents' of $X_j$...
* ... and $P_N = P_{N_1}, ..., P_{N_d}$ is a joint distribution over $d$ noise variables constrained to be jointly independent...
*  $f_j$ is a **structural assignment**: a deterministic subroutine that maps values of direct causes and noise variables to direct effects.
* Structural assignments **model the causal mechanism** deterministically. Stochasticity is exclusively captured by noise variables.
* The SCM $\mathfrak{C}$ defines a unique distribution (denoted $P^{\mathfrak{C}}_{\mathbb{X}}$) over the variables $\mathbb{X} = (X_1, ..., X_d)$.
