**Goal: Variational Inference on BNP Mixture of Projected Gammas**

In [None]:
import silence_tensorflow.auto
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
from tensorflow_probability import bijectors as tfb
from numpy.random import gamma

from tfprojgamma import ProjectedGamma
from data import Data
# Seed the global NumPy and TensorFlow random generators so that the random
# draws made further down the notebook are repeatable from a fresh kernel.
np.random.seed(1)
tf.random.set_seed(1)

In [2]:
def stickbreak(v):
    """
    Map stick-breaking fractions to mixture weights.

    The last axis of `v` holds K - 1 break fractions; the result has K entries
    on its last axis:
        w_k = v_k * prod_{l < k} (1 - v_l)    for k = 1, ..., K - 1
        w_K = prod_{l <= K - 1} (1 - v_l)
    so the weights sum to 1 along the last axis. Leading (batch) axes are left
    untouched.

    Adapted from https://luiarthur.github.io/TuringBnpBenchmarks/dpsbgmm

    NOTE: the remaining-stick lengths come from a plain cumulative product, so
    floating-point precision for long sticks is an open concern that has not
    been investigated yet.
    """
    n_batch_axes = len(v.shape) - 1
    no_padding = [[0, 0]] * n_batch_axes
    # Stick left over after each break: prod_{l <= k} (1 - v_l).
    remaining = tf.math.cumprod(1 - v, axis=-1)
    # Append a final fraction of 1: the last component takes whatever is left.
    fractions = tf.pad(v, no_padding + [[0, 1]], "CONSTANT", constant_values=1)
    # Prepend a length of 1: the whole stick is available before the first break.
    available = tf.pad(remaining, no_padding + [[1, 0]], "CONSTANT", constant_values=1)
    return fractions * available

**Joint Distribution**
$$
\begin{aligned}
y_i\mid\alpha_i &\sim \text{PG}_p(y_i\mid\alpha_i,1_d)\\
\log\alpha_i \mid G &\sim G\\
G &\sim \mathcal{PY}(\eta,d,G_0)\\
\end{aligned}
~\hspace{1cm}
\begin{aligned}
G_0 &= \mathcal{N}_d\left(\log\alpha\mid\mu,\Sigma\right)\\
\mu &\sim \mathcal{N}_d(\mu_0,I_d)\\
\Sigma &\sim \mathcal{IW}\left(\nu,\Psi\right)\\
\end{aligned}
$$

Currently building on the Dirichlet process with the stick-breaking construction outlined in Ishwaran and James (2001); Pitman–Yor will be incorporated later. Below, $\pi_j$ is the prior probability of falling into cluster $j$ and $J$ is the stick-breaking truncation point (maximum number of clusters):
$$
\begin{aligned}
    \tau_j &\sim \text{Beta}\left(1, \eta\right) \text{ for }j = 1,\ldots,J-1\\
    \pi_j &= \begin{cases}
        \tau_j\prod_{k < j}(1 - \tau_{k}) &\text{ for }j = 1,\ldots,J-1\\
        \prod_{k = 1}^{J-1}(1 - \tau_k) &\text{ for }j = J\\
        \end{cases}\\
\end{aligned}
$$

In [55]:
def generative_model(nDat, nCol, nClust, df, conc = 2., rate = 10., dtype = np.float32):
    """
    Build the joint distribution (priors and likelihood) of the truncated
    stick-breaking mixture of projected gammas.

    Args:
        nDat:   number of observations (sample shape of the `obs` node).
        nCol:   dimension of each observation / of the log-shape vectors.
        nClust: stick-breaking truncation level (number of mixture components).
        df:     degrees of freedom of the Wishart underlying the prior on `Sigma`.
        conc:   concentration of the Gamma prior on `eta`.
        rate:   rate of the Gamma prior on `eta`.
        dtype:  NumPy floating-point type used for the constant parameters.

    Returns:
        A `tfd.JointDistributionNamed` with nodes
            mu       -- shared mean of the log-shape vectors, event shape (nCol,)
            Sigma    -- Cholesky factor of the shared covariance, (nCol, nCol)
            eta      -- concentration parameter of the stick-breaking prior, scalar
            tau      -- stick-breaking fractions, event shape (nClust - 1,)
            logalpha -- per-cluster log-shape vectors, event shape (nClust, nCol)
            obs      -- the observations, `nDat` draws from the mixture
    """
    return tfd.JointDistributionNamed(dict(
        # Hierarchical Mean
        mu = tfd.MultivariateNormalDiag(loc = np.zeros(nCol, dtype), scale_diag = np.ones(nCol, dtype)),
        # Hierarchical Covariance.  CholeskyToInvCholesky maps chol(X) to
        # chol(X^{-1}), so the Wishart has to emit Cholesky factors
        # (input_output_cholesky = True); the node is then directly usable as
        # `scale_tril` below.
        Sigma = tfd.TransformedDistribution(
            tfd.WishartTriL(
                df = df,
                scale_tril = np.linalg.cholesky(df * np.eye(nCol, dtype = dtype)),
                input_output_cholesky = True,
                ),
            tfb.CholeskyToInvCholesky(),
            ),
        # DP concentration Parameter
        eta = tfd.Gamma(concentration = dtype(conc), rate = dtype(rate)),
        # DP Stickbreaking Construction: tau_j ~ Beta(1, eta).  In TFP the first
        # Beta parameter is `concentration1` and the second is `concentration0`.
        tau = lambda eta: tfd.Independent(
            tfd.Beta(
                concentration1 = np.ones(nClust - 1, dtype),
                concentration0 = eta[..., tf.newaxis],
                ),
            # Treat the nClust - 1 fractions as one event.  The default (None)
            # reinterprets no dimensions when the batch has rank 1.
            reinterpreted_batch_ndims = 1,
            ),
        # log-shape parameters for Projected Gamma.  A cluster axis is inserted
        # into mu and Sigma so that leading sample dimensions broadcast too.
        logalpha = lambda mu, Sigma: tfd.Independent(
            tfd.MultivariateNormalTriL(
                loc = mu[..., tf.newaxis, :] * np.ones((nClust, 1), dtype),
                scale_tril = Sigma[..., tf.newaxis, :, :],
                ),
            reinterpreted_batch_ndims = 1,
            ),
        # Likelihood.  The lambda's argument names must be node names of this
        # joint distribution: the mixture weights are derived from `tau`.
        obs = lambda logalpha, tau: tfd.Sample(
            tfd.MixtureSameFamily(
                mixture_distribution = tfd.Categorical(probs = stickbreak(tau)),
                components_distribution = ProjectedGamma(tf.exp(logalpha)),
                ),
            sample_shape = nDat,
            ),
        ))

**Import data and pre-process**

In [29]:
# Read the raw table and wrap it in `Data`, declaring every column a real-valued variable.
raw = pd.read_csv('../datasets/ivt_nov_mar.csv').to_numpy()
data = Data(
    raw,
    real_vars = list(np.arange(raw.shape[1])),
    decluster = True,
    )
# Model settings passed to `generative_model`: the stick-breaking truncation
# level and the Wishart degrees of freedom.
nClust = 100
df = 40

In [40]:
# Instantiate the joint model with the dimensions of the loaded data set.
joint_model = generative_model(
    nDat = data.nDat,
    nCol = data.nCol,
    nClust = nClust,
    df = df,
    )

**Gaussian Variational Bayes -- Dependence Between Columns**
$$
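\begin{aligned}
\mu &\sim \mathcal{N}_d\left(\mu_{q\mu}, L_{q\mu}L_{q\mu}^T\right)\\
\Sigma &\sim \mathcal{IW}\left(\nu_{q\Sigma},\Psi_{q\Sigma}\right)\\
\log\alpha_j &\sim \mathcal{N}_d\left(\mu_{q\alpha_j},\Sigma_{q\alpha_j}\right)\\
\end{aligned}
~ \hspace{1cm}
\begin{aligned}
\tau_j &\sim \text{LogitNormal}\left(\mu_{q\tau_j},\sigma^2_{q\tau_j}\right)\\
\eta &\sim \text{LogNormal}\left(\mu_{q\eta},\sigma^2_{q\eta}\right)\\
\end{aligned}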
$$

**Variational Parameters**
$$
\begin{aligned}
\mu &\sim \mathcal{N}_d\left(\mu\mid\mu_{\mu},\Sigma_{\mu}\right)\\
\log\alpha_j &\sim \mathcal{N}_d\left(\mu,\Sigma\right)\\
\end{aligned}
~\hspace{1cm}~
\begin{aligned}
\tau_j &\sim \mathcal{LGN}\left(\mu,\Sigma \right)
\end{aligned}
$$

In [None]:
# Trainable variational parameters of the surrogate posterior.
# The column count is taken from the loaded data so the cell runs on a fresh
# kernel; int() keeps the arithmetic below in plain Python numbers.
nCol = int(data.nCol)

# mu: mean vector and lower-triangular scale factor of a multivariate normal.
# (tf.random.normal expects a 1-D shape, hence the list.)
qMuMu   = tf.Variable(tf.random.normal([nCol], dtype = tf.float32), name = 'qMuMu')
cholbijector = tfb.FillScaleTriL(diag_bijector = tfb.Exp())
qMuLu   = tfp.util.TransformedVariable(tf.eye(nCol), bijector = cholbijector, name = 'qMuLu')

# Sigma: scale factor and degrees of freedom of the underlying Wishart.
qSigmaPsiL = tfp.util.TransformedVariable(tf.eye(nCol), bijector = cholbijector, name = 'qSigmaPsiL')
# The Wishart degrees of freedom must not fall below the matrix dimension, so
# the variable is constrained to nCol + exp(x) and started at nCol + 2.
qSigmaNu = tfp.util.TransformedVariable(
    float(nCol + 2),
    bijector = tfb.Chain([tfb.Shift(float(nCol)), tfb.Exp()]),
    name = 'qSigmaNu',
    )

# log alpha: per-cluster location and unconstrained scale (mapped through
# softplus where the surrogate is built).
qLogAlphaMu  = tf.Variable(tf.random.normal((nClust, nCol)), name = 'qLogAlphaMu')
qLogAlphaRho = tf.Variable(tf.random.normal((nClust, nCol)), name = 'qLogAlphaRho')

# tau: location and unconstrained scale for each of the nClust - 1 fractions.
qTauLoc  = tf.Variable(tf.random.normal([nClust - 1]), name = 'qTauLoc')
qTauRho  = tf.Variable(tf.random.normal([nClust - 1]), name = 'qTauRho')

# eta: scalar location and unconstrained scale.
qEtaLoc = tf.Variable(tf.random.normal([]), name = 'qEtaLoc')
qEtaRho = tf.Variable(tf.random.normal([]), name = 'qEtaRho')

In [None]:
# Surrogate (variational) posterior over the latent nodes of the joint model.
# Node names must match those of the model (`mu`, `Sigma`, `logalpha`, `tau`,
# `eta`).  Scales are wrapped in DeferredTensor so that softplus is re-applied
# on every use and gradients reach the underlying *Rho variables; calling
# tf.nn.softplus(variable) here would freeze the scale at its initial value.
surrogate_posterior = tfd.JointDistributionNamed(dict(
    # Hierarchical mean
    mu       = tfd.MultivariateNormalTriL(loc = qMuMu, scale_tril = qMuLu),
    # Hierarchical Covariance (Cholesky factor).  CholeskyToInvCholesky acts on
    # Cholesky factors, so the Wishart must produce them.
    Sigma    = tfd.TransformedDistribution(
        tfd.WishartTriL(df = qSigmaNu, scale_tril = qSigmaPsiL, input_output_cholesky = True),
        tfb.CholeskyToInvCholesky(),
        ),
    # Cluster Means: qLogAlphaRho holds one unconstrained scale per cluster and
    # column, i.e. a diagonal covariance per cluster, not a triangular factor.
    logalpha = tfd.Independent(
        tfd.MultivariateNormalDiag(
            loc = qLogAlphaMu,
            scale_diag = tfp.util.DeferredTensor(qLogAlphaRho, tf.nn.softplus),
            ),
        reinterpreted_batch_ndims = 1,
        ),
    # Stick-breaking fractions, supported on (0, 1)
    tau = tfd.Independent(
        tfd.LogitNormal(qTauLoc, tfp.util.DeferredTensor(qTauRho, tf.nn.softplus)),
        reinterpreted_batch_ndims = 1,
        ),
    # Concentration parameter, supported on (0, inf)
    eta = tfd.LogNormal(qEtaLoc, tfp.util.DeferredTensor(qEtaRho, tf.nn.softplus)),
    ))


$$
\log\alpha \sim \text{MVNormal}(\log\alpha \mid \mu_q, \Sigma_q)
$$

In [None]:
# New Style: Make the variational Parameters
# A multivariate-normal surrogate whose location (q_nu) and lower-triangular
# scale factor (q_Lu) are trainable.
# NOTE(review): the dimension 5 is hard-coded in both q_nu and q_Lu -- confirm
# it matches the dimension expected by the target log-probability.
q_nu = tf.Variable(tf.zeros(5, dtype = tf.float32), name = 'Mu Surrogate (mean of log alpha)')
cholbijector = tfb.FillScaleTriL(diag_bijector = tfb.Exp())
q_Lu = tfp.util.TransformedVariable(tf.eye(5), bijector = cholbijector)

surrogate_posterior_mvnorm = tfd.MultivariateNormalTriL(loc = q_nu, scale_tril = q_Lu)

# Sanity check: form a Monte Carlo estimate of the negative ELBO from 100
# surrogate draws and print its gradient w.r.t. the surrogate's trainable variables.
# NOTE(review): `model_joint_log_prob` is not defined in any cell visible in
# this notebook -- it has to be defined before this cell can run on a fresh kernel.
with tf.GradientTape() as g:
    samples = surrogate_posterior_mvnorm.sample(100)
    neg_elbo = -tf.reduce_mean(model_joint_log_prob(samples) - surrogate_posterior_mvnorm.log_prob(samples))
print(g.gradient(neg_elbo, surrogate_posterior_mvnorm.trainable_variables)) # Exists!

In [None]:
# Fit the multivariate-normal surrogate with Adam; the value returned by the
# fitting routine is kept in `path_mvnorm`.
path_mvnorm = tfp.vi.fit_surrogate_posterior(
    model_joint_log_prob,
    surrogate_posterior_mvnorm,
    optimizer = tf.optimizers.Adam(learning_rate = .2),
    num_steps = 1000,
    sample_size = 500,
    )
# Exponentiate the fitted location vector for inspection.
print(tf.exp(q_nu)) # This gives the same basic response as previous.

In [None]:
# Element-wise flag of which entries of q_Lu q_Lu^T (the surrogate's covariance
# matrix implied by its scale factor) are negative.
(q_Lu.numpy() @ q_Lu.numpy().T < 0)

I guess it makes some sense that the posterior covariance between parameters of the Dirichlet would be positive, despite the covariance between *values* of the Dirichlet being negative.