In [1]:
import numpy as np
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
import scipy.linalg

Methods to calculate the KL divergence. When the optimal Cholesky factor is used, the trace term cancels, so only the log-determinants are needed.

In [3]:
def logdet_chol(A):
    """Return twice the sum of the logs of the diagonal entries of ``A``.

    When ``A`` is a triangular Cholesky factor, this value equals the
    log-determinant of ``A @ A.T``.

    Parameters
    ----------
    A : ndarray
        Square matrix whose diagonal entries are positive.

    Returns
    -------
    float
        ``2 * sum(log(diag(A)))``.
    """
    diagonal_logs = np.log(A.diagonal())
    return 2 * np.sum(diagonal_logs)

def kl_div(A, B):
    """Evaluate ``0.5 * (tr(B^{-1} A) - n + logdet(B) - logdet(A))``.

    This is the KL divergence between two zero-mean Gaussians with
    covariance matrices ``A`` and ``B``.

    Parameters
    ----------
    A, B : ndarray
        Square matrices of the same size ``n``.

    Returns
    -------
    float
        The value of the expression above.
    """
    dim = A.shape[0]
    # tr(B^{-1} A) via a linear solve rather than an explicit inverse.
    trace_term = np.trace(np.linalg.solve(B, A))
    _, logdet_b = np.linalg.slogdet(B)
    _, logdet_a = np.linalg.slogdet(A)
    return 0.5 * (trace_term - dim + logdet_b - logdet_a)

def sparse_kl_div(A, L):
    """Evaluate ``0.5 * (-logdet_chol(L) - logdet(A))``.

    This is the KL expression with the trace and dimension terms dropped;
    per the notebook's note those cancel when ``L`` is the optimal
    Cholesky factor.

    Parameters
    ----------
    A : array_like
        Square matrix whose log-determinant is taken.
    L : ndarray
        Triangular factor passed to ``logdet_chol``
        (presumably a factor of the approximate inverse of ``A`` - confirm
        against callers).

    Returns
    -------
    float
        The simplified divergence value.
    """
    return 0.5 * (-logdet_chol(L) - np.linalg.slogdet(A)[1])

In [4]:
%%latex
Matern kernel with $\nu = \frac{1}{2}$ and $l=1$: $$\Theta_{i,j} = \exp(-||x_i-x_j||_2)$$

<IPython.core.display.Latex object>

In [5]:
def kernel(points):
    """Build the kernel matrix ``exp(-||x_i - x_j||_2)`` for all point pairs.

    All pairwise distances are computed in one broadcasted operation
    instead of a Python-level double loop.

    Parameters
    ----------
    points : array_like
        ``n`` points stacked along the first axis. Any trailing axes are
        flattened into a single coordinate axis, so a 1-D input is treated
        as ``n`` scalar points.

    Returns
    -------
    ndarray
        ``(n, n)`` matrix with entry ``[i, j] = exp(-||points[i] - points[j]||)``.
    """
    pts = np.asarray(points)
    n = pts.shape[0]
    if n == 0:
        # reshape(0, -1) is ambiguous, so handle the empty case explicitly.
        return np.zeros((0, 0))
    pts = pts.reshape(n, -1)
    # (n, 1, d) - (1, n, d) -> (n, n, d) array of pairwise differences.
    diffs = pts[:, None, :] - pts[None, :, :]
    return np.exp(-np.linalg.norm(diffs, axis=-1))

Orderings for the Cholesky factorization by picking the maximum of the minimum distances between points.

In [7]:
def reverse_maximin(points):
    """Compute a reverse maximin ordering of ``points``.

    Point 0 is selected first and stored in the last slot. Each later pick
    is the unselected point farthest from everything already selected,
    stored in the next-earlier slot.

    Selected points are masked with ``-inf`` so they can never be picked
    again. Without the mask, duplicate points drive every remaining
    distance to 0 and ``argmax`` would return an already selected index.

    Parameters
    ----------
    points : array_like
        ``n`` points stacked along the first axis; trailing axes are
        flattened into one coordinate axis.

    Returns
    -------
    indices : ndarray of int, shape (n,)
        Permutation of ``range(n)`` giving the ordering.
    lengths : ndarray of float, shape (n,)
        Distance from each chosen point to the previously selected set at
        the time it was chosen; ``inf`` for the first pick (last slot).
    """
    pts = np.asarray(points)
    n = pts.shape[0]
    indices = np.zeros(n, dtype=int)
    lengths = np.zeros(n, dtype=float)
    if n == 0:
        return indices, lengths
    pts = pts.reshape(n, -1)
    dists = np.linalg.norm(pts - pts[0], axis=1)
    dists[0] = -np.inf  # point 0 is already selected
    indices[-1] = 0
    lengths[-1] = np.inf
    for i in range(n - 2, -1, -1):
        k = np.argmax(dists)
        indices[i] = k
        lengths[i] = dists[k]
        dists = np.minimum(dists, np.linalg.norm(pts[k] - pts, axis=1))
        dists[k] = -np.inf  # never pick k again, even among duplicates
    return indices, lengths

def maximin(points):
    """Compute a maximin ordering of ``points``.

    Point 0 is selected first. Each later pick is the unselected point
    farthest from everything already selected, stored in the next slot.

    Selected points are masked with ``-inf`` so they can never be picked
    again. Without the mask, duplicate points drive every remaining
    distance to 0 and ``argmax`` would return an already selected index.

    Parameters
    ----------
    points : array_like
        ``n`` points stacked along the first axis; trailing axes are
        flattened into one coordinate axis.

    Returns
    -------
    indices : ndarray of int, shape (n,)
        Permutation of ``range(n)`` giving the ordering.
    lengths : ndarray of float, shape (n,)
        Distance from each chosen point to the previously selected set at
        the time it was chosen; ``inf`` for the first pick.
    """
    pts = np.asarray(points)
    n = pts.shape[0]
    indices = np.zeros(n, dtype=int)
    lengths = np.zeros(n, dtype=float)
    if n == 0:
        return indices, lengths
    pts = pts.reshape(n, -1)
    dists = np.linalg.norm(pts - pts[0], axis=1)
    dists[0] = -np.inf  # point 0 is already selected
    indices[0] = 0
    lengths[0] = np.inf
    for i in range(1, n):
        k = np.argmax(dists)
        indices[i] = k
        lengths[i] = dists[k]
        dists = np.minimum(dists, np.linalg.norm(pts[k] - pts, axis=1))
        dists[k] = -np.inf  # never pick k again, even among duplicates
    return indices, lengths

In [None]:
%%latex
Sparsity patterns which can naively be calculated by checking if $||x_i-x_j||_2 \leq \rho\min(l_i,l_j)$

<IPython.core.display.Latex object>

In [None]:
def naive_sparsity_pattern(points, lengths, rho):
    """Brute-force sparsity pattern by testing every pair ``i <= j``.

    Index ``j`` is recorded for column ``i`` when
    ``||points[i] - points[j]|| <= rho * min(lengths[i], lengths[j])``.

    Parameters
    ----------
    points : ndarray
        Points indexed along the first axis.
    lengths : sequence of float
        Length scale associated with each point.
    rho : float
        Multiplier applied to the length scale.

    Returns
    -------
    dict[int, list[int]]
        For every ``i``, the ascending list of ``j >= i`` passing the test.
    """
    count = len(points)
    pattern = {}
    for row in range(count):
        neighbours = []
        for other in range(row, count):
            radius = min(lengths[row], lengths[other]) * rho
            if np.linalg.norm(points[row] - points[other]) <= radius:
                neighbours.append(other)
        pattern[row] = neighbours
    return pattern

def kd_sparsity_pattern(points, lengths, rho):
    """Sparsity pattern computed with a KD-tree ball query.

    Index ``j >= i`` is recorded for column ``i`` when
    ``||points[i] - points[j]|| <= rho * min(lengths[i], lengths[j])``.

    A ball query around ``i`` only enforces the ``rho * lengths[i]``
    bound, so the pair is kept only if ``i`` also lies in the ball around
    ``j``. Together these give the ``min`` criterion for any ``lengths``,
    not just ones that are non-decreasing in index.

    Parameters
    ----------
    points : ndarray, shape (n, d)
        Point coordinates.
    lengths : array_like, shape (n,)
        Length scale associated with each point.
    rho : float
        Multiplier applied to the length scale.

    Returns
    -------
    dict[int, list[int]]
        For every ``i``, the ascending list of ``j >= i`` passing the test.
    """
    radii = rho * np.asarray(lengths, dtype=float)
    tree = KDTree(points)
    # Sets make the symmetric membership test O(1) per pair.
    near = [set(ball) for ball in tree.query_ball_point(points, radii)]
    return {
        i: sorted(j for j in near[i] if j >= i and i in near[j])
        for i in range(len(points))
    }

In [17]:
%%latex
Aggregate sparsity pattern into supernode groups such that: $l_j \leq \lambda l_i$

<IPython.core.display.Latex object>

In [10]:
def supernodes(sparsity, lengths, lamb):
    """Greedily aggregate columns into supernode groups.

    The smallest unassigned index ``i`` starts a group. Every still
    unassigned ``j`` in ``sparsity[i]`` with
    ``lengths[j] <= lamb * lengths[i]`` joins it. The group's aggregated
    pattern is the sorted union of its members' patterns.

    ``i`` is always placed in its own group. This guarantees progress and
    a non-empty group even when ``i`` fails its own test (for example
    ``lamb < 1``), which would otherwise raise ``IndexError``.

    Parameters
    ----------
    sparsity : mapping[int, iterable[int]]
        Per-column sparsity pattern.
    lengths : sequence of float
        Length scale associated with each index.
    lamb : float
        Aggregation factor.

    Returns
    -------
    groups : list[list[int]]
        Sorted member lists, in creation order.
    agg_sparsity : dict[int, list[int]]
        Maps each group's first member to the group's aggregated pattern.
    """
    groups = []
    agg_sparsity = {}
    candidates = set(range(len(lengths)))
    i = 0
    while candidates:
        # Advance to the smallest index not yet assigned to a group.
        while i not in candidates:
            i += 1
        limit = lamb * lengths[i]
        members = {
            j for j in sparsity[i]
            if j in candidates and lengths[j] <= limit
        }
        members.add(i)  # the seed always belongs to its own group
        group = sorted(members)
        groups.append(group)
        candidates.difference_update(group)
        agg_sparsity[group[0]] = sorted({k for j in group for k in sparsity[j]})
    return groups, agg_sparsity

In [18]:
%%latex
Naive sparse Cholesky factorization based on the KL divergence, where each column is given by $$L_{s_i} = \frac{\Theta_{s_i,s_i}^{-1}e_1}{\sqrt{e_1^T\Theta_{s_i,s_i}^{-1}e_1}}$$

<IPython.core.display.Latex object>

In [None]:
def col(theta, s):
    """Compute ``Theta_ss^{-1} e_1 / sqrt(e_1^T Theta_ss^{-1} e_1)``.

    ``Theta_ss`` is the submatrix of ``theta`` on the indices ``s``. Only
    the first column of its inverse is needed, so one linear solve against
    ``e_1`` replaces the full inverse.

    Parameters
    ----------
    theta : array_like
        Square kernel matrix.
    s : sequence of int
        Non-empty list of indices; the first entry is the column being
        computed.

    Returns
    -------
    ndarray, shape (len(s),)
        The normalised first column of the inverse submatrix.
    """
    theta = np.asarray(theta)
    sub = theta[np.ix_(s, s)]
    e1 = np.zeros(len(s))
    e1[0] = 1.0
    first_col = np.linalg.solve(sub, e1)
    # first_col[0] equals e_1^T Theta_ss^{-1} e_1.
    return first_col / np.sqrt(first_col[0])

def chol(theta, sparsity):
    """Assemble a sparse factor column by column from ``sparsity``.

    For each column index, the sorted pattern is passed to ``col`` and the
    returned values are scattered into the matching rows of that column.

    Parameters
    ----------
    theta : ndarray
        Square kernel matrix.
    sparsity : mapping[int, iterable[int]]
        Row indices allowed to be non-zero in each column, keyed
        ``0 .. n-1``.

    Returns
    -------
    ndarray, shape (n, n)
        Matrix that is zero outside the given pattern.
    """
    size = len(sparsity)
    factor = np.zeros((size, size))
    for column in range(size):
        rows = sorted(sparsity[column])
        values = col(theta, rows)
        for position, row in enumerate(rows):
            factor[row, column] = values[position]
    return factor

def naive_kl_cholesky(points, rho):
    """Run the column-by-column sparse factorization pipeline.

    Steps: order the points with ``reverse_maximin``, build the pattern
    with ``kd_sparsity_pattern``, evaluate ``kernel`` on the reordered
    points, and pass both to ``chol``.

    Parameters
    ----------
    points : ndarray
        Points indexed along the first axis.
    rho : float
        Sparsity radius multiplier.

    Returns
    -------
    ndarray
        Sparse factor, expressed in the reordered point indexing.
    """
    order, scales = reverse_maximin(points)
    reordered = points[order]
    pattern = kd_sparsity_pattern(reordered, scales, rho)
    return chol(kernel(reordered), pattern)

In [27]:
%%latex
The aggregated cholesky factorization forms supernodes and then calculates the cholesky factor using $$L_{:, k} = U^{-T}e_k$$
Note that $flip(chol(flip(\Theta)))^T = (chol(\Theta^{-1}))^{-1}=U^T$

<IPython.core.display.Latex object>

In [13]:
def cols(theta):
    """Return the lower-triangular ``R`` with ``R.T @ R == theta``.

    ``theta`` is reversed along every axis, Cholesky-factorised, reversed
    back (which gives an upper-triangular factor), and then transposed.

    Parameters
    ----------
    theta : ndarray
        Symmetric positive-definite matrix.

    Returns
    -------
    ndarray
        Lower-triangular matrix of the same shape as ``theta``.
    """
    reversed_theta = np.flip(theta)
    reversed_factor = np.linalg.cholesky(reversed_theta)
    upper = np.flip(reversed_factor)
    return upper.T

def aggregate_chol(theta, sparsity, groups):
    """Assemble a sparse factor one supernode group at a time.

    For each group, the submatrix of ``theta`` on the group's aggregated
    pattern is passed to ``cols`` once. Each member column is then found
    by a triangular solve against a unit vector. Only entries at or below
    the member's own position in the pattern are written.

    Parameters
    ----------
    theta : ndarray
        Square kernel matrix.
    sparsity : mapping[int, iterable[int]]
        Aggregated pattern, keyed by each group's first member.
    groups : iterable[list[int]]
        Supernode member lists.

    Returns
    -------
    ndarray, shape (n, n)
        Matrix that is zero outside the aggregated pattern.
    """
    size = len(theta)
    factor = np.zeros((size, size))
    for group in groups:
        rows = sorted(sparsity[group[0]])
        width = len(rows)
        local_index = {row: pos for pos, row in enumerate(rows)}
        # Dense copy of theta restricted to this group's pattern.
        block = np.zeros((width, width))
        for a, row_a in enumerate(rows):
            for b, row_b in enumerate(rows):
                block[a, b] = theta[row_a, row_b]
        lower = cols(block)
        for column in group:
            start = local_index[column]
            unit = np.zeros(width)
            unit[start] = 1
            solution = scipy.linalg.solve_triangular(
                lower, unit, lower=True, check_finite=False
            )
            for pos in range(start, width):
                factor[rows[pos], column] = solution[pos]
    return factor

def aggregated_kl_cholesky(points, rho, lamb):
    """Run the supernode-aggregated sparse factorization pipeline.

    Steps: order the points with ``reverse_maximin``, build the pattern
    with ``kd_sparsity_pattern``, group columns with ``supernodes``,
    evaluate ``kernel`` on the reordered points, and call
    ``aggregate_chol``.

    Parameters
    ----------
    points : ndarray
        Points indexed along the first axis.
    rho : float
        Sparsity radius multiplier.
    lamb : float
        Supernode aggregation factor.

    Returns
    -------
    ndarray
        Sparse factor, expressed in the reordered point indexing.
    """
    order, scales = reverse_maximin(points)
    reordered = points[order]
    pattern = kd_sparsity_pattern(reordered, scales, rho)
    groups, grouped_pattern = supernodes(pattern, scales, lamb)
    return aggregate_chol(kernel(reordered), grouped_pattern, groups)

In [20]:
%%latex
The iterative method finds $L$ using the above methods, creates a new kernel matrix $L^T\Theta L$ and then finds the sparse cholesky factor $L'$. The final result is $LL'$

<IPython.core.display.Latex object>

In [None]:
def naive_iterative_kl_cholesky(points, rho):
    """Two-pass variant of the column-by-column factorization.

    A first factor ``L`` is computed with ``chol``. The matrix
    ``L.T @ theta @ L`` is then factorised again on the same pattern, and
    the product of the two factors is returned.

    Parameters
    ----------
    points : ndarray
        Points indexed along the first axis.
    rho : float
        Sparsity radius multiplier.

    Returns
    -------
    ndarray
        ``L @ L'`` where ``L'`` is the second-pass factor.
    """
    order, scales = reverse_maximin(points)
    reordered = points[order]
    pattern = naive_sparsity_pattern(reordered, scales, rho)
    theta = kernel(reordered)
    first_pass = chol(theta, pattern)
    # Matrix that the second pass factorises.
    transformed = first_pass.T @ theta @ first_pass
    second_pass = chol(transformed, pattern)
    return first_pass @ second_pass

def iterative_aggregated_kl_cholesky(points, rho, lamb):
    """Two-pass variant of the supernode-aggregated factorization.

    A first factor ``L`` is computed with ``aggregate_chol``. The matrix
    ``L.T @ theta @ L`` is then factorised again with the same groups and
    aggregated pattern, and the product of the two factors is returned.

    Parameters
    ----------
    points : ndarray
        Points indexed along the first axis.
    rho : float
        Sparsity radius multiplier.
    lamb : float
        Supernode aggregation factor.

    Returns
    -------
    ndarray
        ``L @ L'`` where ``L'`` is the second-pass factor.
    """
    order, scales = reverse_maximin(points)
    reordered = points[order]
    pattern = naive_sparsity_pattern(reordered, scales, rho)
    groups, grouped_pattern = supernodes(pattern, scales, lamb)
    theta = kernel(reordered)
    first_pass = aggregate_chol(theta, grouped_pattern, groups)
    # Matrix that the second pass factorises.
    transformed = first_pass.T @ theta @ first_pass
    second_pass = aggregate_chol(transformed, grouped_pattern, groups)
    return first_pass @ second_pass