# Configuration

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pods

from gp_ima.ima import C_ima_digamma, C_ima_sample
import GPy
from tueplots import bundles, figsizes

In [None]:
import sys

# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Put the working directory first on the import path
# (presumably so the `analysis` module imported in the next cell resolves locally).
sys.path.insert(0, '.')

In [None]:
from analysis import plot_typography, estimate2uniform, generate_moebius_data, format_violin, RED, BLUE, calc_mcc

In [None]:
# Passed as `usetex` to both the tueplots bundle and plot_typography below.
USETEX = True

In [None]:
# Apply the tueplots NeurIPS 2022 style bundle.
plt.rcParams.update(bundles.neurips2022(usetex=USETEX))
# `text.latex.preamble` has to be a single string: older matplotlib releases
# joined a list of lines with newlines implicitly, current releases no longer
# accept a list.  Joining explicitly gives the same preamble on both.
plt.rcParams.update({
    'text.latex.preamble': '\n'.join([
        r'\usepackage{amsfonts}',  # mathbb
        r'\usepackage{amsmath}',  # boldsymbol
    ])
})

In [None]:
# Configure plot typography through the project helper
# (small/medium/big are presumably font sizes — TODO confirm in analysis.py).
plot_typography(usetex=USETEX, small=12, medium=16, big=20)

# Functions

In [None]:
def train_bayesian_gplvm(X, dim, num_samples_c_ima, num_restarts, num_seeds, seed):
    """Fit ``num_seeds`` Bayesian GPLVMs to ``X`` and collect C_IMA samples and latents.

    Args:
        X: Observations; converted with ``np.asarray`` before being passed to GPy.
        dim: Latent dimensionality (also the input dimension of the RBF kernel).
        num_samples_c_ima: Accepted for call-site compatibility; not used here.
        num_restarts: Forwarded to ``optimize_restarts`` for every model.
        num_seeds: Number of models to fit.
        seed: Accepted for call-site compatibility; not used here (this
            function does not seed the RNG itself).

    Returns:
        A 4-tuple of lists with one entry per fitted model:
        ``C_ima_sample`` after optimization, ``C_ima_sample`` before
        optimization, the latent means ``m.X.mean``, and
        ``estimate2uniform`` applied to those means.
    """
    posterior_cimas, prior_cimas = [], []
    latent_means, uniform_latents = [], []

    for _ in range(num_seeds):
        rbf = GPy.kern.RBF(dim, ARD=False)
        model = GPy.models.BayesianGPLVM(np.asarray(X), dim, kernel=rbf, num_inducing=20)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)

        # C_IMA is sampled once before and once after optimization.
        prior_cimas.append(C_ima_sample(model))
        model.optimize_restarts(num_restarts, optimizer='lbfgs')
        posterior_cimas.append(C_ima_sample(model))

        latent_mean = model.X.mean
        latent_means.append(latent_mean)
        uniform_latents.append(estimate2uniform(latent_mean))

    return posterior_cimas, prior_cimas, latent_means, uniform_latents

def train_gplvm(X, dim, num_samples_c_ima, num_restarts, num_seeds, seed):
    """Fit ``num_seeds`` GPLVMs to ``X`` and collect C_IMA samples and latents.

    Args:
        X: Observations; converted with ``np.asarray`` before being passed to GPy.
        dim: Latent dimensionality (also the input dimension of the RBF kernel).
        num_samples_c_ima: Accepted for call-site compatibility; not used here.
        num_restarts: Forwarded to ``optimize_restarts`` for every model.
        num_seeds: Number of models to fit.
        seed: Accepted for call-site compatibility; not used here (this
            function does not seed the RNG itself).

    Returns:
        A 4-tuple of lists with one entry per fitted model:
        ``C_ima_sample`` after optimization, ``C_ima_sample`` before
        optimization, the latents ``m.X.values``, and ``estimate2uniform``
        applied to those latents.
    """
    posterior_cimas, prior_cimas = [], []
    latents, uniform_latents = [], []

    for _ in range(num_seeds):
        rbf = GPy.kern.RBF(dim, ARD=False)
        model = GPy.models.GPLVM(np.asarray(X), dim, kernel=rbf)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)

        # C_IMA is sampled once before and once after optimization.
        prior_cimas.append(C_ima_sample(model))
        model.optimize_restarts(num_restarts, optimizer='lbfgs')
        posterior_cimas.append(C_ima_sample(model))

        fitted_latents = model.X.values
        latents.append(fitted_latents)
        uniform_latents.append(estimate2uniform(fitted_latents))

    return posterior_cimas, prior_cimas, latents, uniform_latents

def calc_cima_prior_sample(dim, num_data, seed=42):
    """Sample C_IMA from unoptimized GPLVMs built on datasets of varying size.

    Args:
        dim: Used as both dimension arguments of ``generate_moebius_data`` and
            as the GPLVM latent dimension.
        num_data: Iterable of dataset sizes; one model is built per entry.
        seed: Seed for the global NumPy RNG, set once before the loop.

    Returns:
        A list with one ``C_ima_sample`` result per entry of ``num_data``.
    """
    np.random.seed(seed)

    prior_cimas = []
    for n_points in num_data:
        _, observations, _ = generate_moebius_data(n_points, dim, dim)

        rbf = GPy.kern.RBF(dim, ARD=False)
        model = GPy.models.GPLVM(np.asarray(observations), dim, kernel=rbf)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)
        # The model is never optimized, so this is the C_IMA at initialization.
        prior_cimas.append(C_ima_sample(model))

    return prior_cimas

def train_oil_gplvm(latent_dim, num_samples_c_ima=100, plot=True):
    """Fit a GPLVM to ``pods.datasets.oil_100()`` and sample C_IMA before and after.

    Args:
        latent_dim: Latent dimensionality of the GPLVM and its kernels.
        num_samples_c_ima: Accepted for call-site compatibility; not used here.
        plot: When true, call ``plot_latent`` on the optimized model.

    Returns:
        ``(model, cima_prior, cima_posterior)``: the optimized model plus the
        ``C_ima_sample`` results taken before and after optimization.
    """
    oil = pods.datasets.oil_100()
    observations = oil["X"]

    rbf_plus_bias = GPy.kern.RBF(latent_dim, ARD=False) + GPy.kern.Bias(latent_dim)
    oil_gplvm = GPy.models.GPLVM(observations, latent_dim, kernel=rbf_plus_bias)
    oil_gplvm.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)
    # Class index per point, taken from the argmax over the label matrix.
    oil_gplvm.data_labels = oil["Y"].argmax(axis=1)

    cima_before = C_ima_sample(oil_gplvm)
    oil_gplvm.optimize("scg", messages=0)
    if plot:
        oil_gplvm.plot_latent(labels=oil_gplvm.data_labels)
    cima_after = C_ima_sample(oil_gplvm)

    return oil_gplvm, cima_before, cima_after

def stick(latent_dim, optimize=True, verbose=False, plot=True, num_samples_c_ima=100, variance=1e-6):
    """Fit a GPLVM to ``pods.datasets.osu_run1()`` and sample C_IMA before and after.

    Args:
        latent_dim: Latent dimensionality of the GPLVM and its kernels.
        optimize: When true, optimize the model with BFGS.
        verbose: Forwarded as ``messages`` to the optimizer.
        plot: When true, call ``plot_latent`` on the model.
        num_samples_c_ima: Accepted for call-site compatibility; not used here.
            The default is a literal because a default expression is evaluated
            when the function is defined, and the notebook-level constant
            ``NUM_SAMPLES_C_IMA`` is only assigned in a later cell.
        variance: Variance of the Gaussian likelihood assigned to the model.

    Returns:
        ``(model, cima_prior, cima_posterior)``: the model plus the
        ``C_ima_sample`` results taken before and after the (optional)
        optimization.
    """
    data = pods.datasets.osu_run1()
    kernel = GPy.kern.RBF(latent_dim, ARD=True) + GPy.kern.Bias(latent_dim)

    m = GPy.models.GPLVM(data["Y"], latent_dim, kernel=kernel)
    m.likelihood = GPy.likelihoods.Gaussian(variance=variance)
    cima_stick_prior = C_ima_sample(m)
    if optimize:
        m.optimize("bfgs", messages=verbose, max_f_eval=15000)
    cima_stick = C_ima_sample(m)
    if plot:
        m.plot_latent()
    return m, cima_stick_prior, cima_stick


# Train models

In [None]:
# Shared settings for the synthetic experiments below.
NUM_DATA = 500  # number of points requested from generate_moebius_data
SEED = 42  # passed to np.random.seed before each data generation
NUM_SEEDS = 5  # number of models trained per configuration



## 2D

In [None]:

# 2D setting: constants and data.
# NOTE(review): the next cell repeats exactly these statements before training,
# so this cell looks redundant.
NUM_SAMPLES_C_IMA = 100
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 2

# Seed the global NumPy RNG right before generating the data.
np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

In [None]:

# 2D experiment: generate the data, then train both model families on it.
NUM_SAMPLES_C_IMA = 100
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 2

# Seed the global NumPy RNG right before generating the data.
np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM: NUM_SEEDS models with 5 optimizer restarts each.
cimas_sparse_2d, cimas_sparse_prior_2d, zs_sparse_2d, zs_uni_sparse_2d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
                                                                                              NUM_RESTARTS, NUM_SEEDS,
                                                                                              SEED)
# calc_mcc of every uniformised reconstruction against the generated latents Z.
mccs_sparse_2d = [calc_mcc(z, Z) for z in zs_uni_sparse_2d]

# Plain GPLVM: same data, but only 2 optimizer restarts per model.
NUM_RESTARTS = 2
cimas_2d, cimas_prior_2d, zs_2d , zs_uni_2d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_2d = [calc_mcc(z, Z) for z in zs_uni_2d]

In [None]:
LABELPAD = 1
TICK_PADDING = 2
IDX = 0
IDX_SPARSE = 0

# Three panels side by side: generated latents, observations, and the
# uniformised latents of one Bayesian GPLVM run (selected by IDX_SPARSE).
fig = plt.figure(figsize=figsizes.neurips2022(nrows=1, ncols=2, rel_width=1)['figure.figsize'])
ax = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
# No "ax3": the plain-GPLVM reconstruction panel is not drawn.
ax4 = fig.add_subplot(133)

reconstruction = zs_uni_sparse_2d[IDX_SPARSE]
panels = [
    (ax, Z, "Latents"),
    (ax2, X, "Observations"),
    (ax4, reconstruction, "Reconstruction"),
]

# Remove ticks and labels on every panel.
ticksoff = dict(labelleft=False, labelright=False, left=False, right=False, labelbottom=False, bottom=False)
for axis, points, title in panels:
    # Every panel is coloured by the same per-point value `c`.
    axis.scatter(points[:, 0], points[:, 1], c=c, cmap="hsv", label=title)
    axis.tick_params(axis="both", **ticksoff)
    axis.set_title(title)

plt.savefig("gplvm_ima.svg")

## 3D

In [None]:
# 3D experiment: regenerate the data with 3 latent/observed dimensions.
NUM_SEEDS = 5
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 3

np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension.
# cimas_sparse_3d, cimas_sparse_prior_3d, zs_sparse_3d, zs_uni_sparse_3d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_3d = [calc_mcc(z, Z) for z in zs_uni_sparse_3d]

# Plain GPLVM with 2 optimizer restarts per model.
NUM_RESTARTS = 2
cimas_3d, cimas_prior_3d, zs_3d , zs_uni_3d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_3d = [calc_mcc(z, Z) for z in zs_uni_3d]

## 5D

In [None]:
# 5D experiment: regenerate the data with 5 latent/observed dimensions.
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 5

np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension.
# cimas_sparse_5d, cimas_sparse_prior_5d, zs_sparse_5d, zs_uni_sparse_5d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_5d = [calc_mcc(z, Z) for z in zs_uni_sparse_5d]

# Plain GPLVM with 2 optimizer restarts per model.
NUM_RESTARTS = 2
cimas_5d, cimas_prior_5d, zs_5d , zs_uni_5d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_5d = [calc_mcc(z, Z) for z in zs_uni_5d]

## 8D

In [None]:
# 8D experiment: regenerate the data with 8 latent/observed dimensions.
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 8

np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension.
# cimas_sparse_8d, cimas_sparse_prior_8d, zs_sparse_8d, zs_uni_sparse_8d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_8d = [calc_mcc(z, Z) for z in zs_uni_sparse_8d]

# Plain GPLVM with 2 optimizer restarts per model.
NUM_RESTARTS = 2
cimas_8d, cimas_prior_8d, zs_8d , zs_uni_8d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_8d = [calc_mcc(z, Z) for z in zs_uni_8d]

## 10D

In [None]:
# 10D experiment: regenerate the data with 10 latent/observed dimensions.
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 10

np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension.
# cimas_sparse_10d, cimas_sparse_prior_10d, zs_sparse_10d, zs_uni_sparse_10d = train_bayesian_gplvm(X, DIM,
#                                                                                                   NUM_SAMPLES_C_IMA,
#                                                                                                   NUM_RESTARTS,
#                                                                                                   NUM_SEEDS, SEED)
# mccs_sparse_10d = [calc_mcc(z, Z) for z in zs_uni_sparse_10d]

# Plain GPLVM with 2 optimizer restarts per model.
NUM_RESTARTS = 2
cimas_10d, cimas_prior_10d, zs_10d , zs_uni_10d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_10d = [calc_mcc(z, Z) for z in zs_uni_10d]

In [None]:
# 12D experiment: regenerate the data with 12 latent/observed dimensions.
# NOTE(review): the *_12d results are not part of the `cimas` / `mccs` lists
# assembled in the following cells, which stop at 10D.
NUM_RESTARTS = 5
DIM = LATENT_DIM = OBS_DIM = 12
NUM_DATA = 500

np.random.seed(SEED)
Z, X, c = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled. NOTE(review): the disabled code still uses the
# `_10d` variable names; rename to `_12d` before re-enabling it.
# cimas_sparse_10d, cimas_sparse_prior_10d, zs_sparse_10d, zs_uni_sparse_10d = train_bayesian_gplvm(X, DIM,
#                                                                                                   NUM_SAMPLES_C_IMA,
#                                                                                                   NUM_RESTARTS,
#                                                                                                   NUM_SEEDS, SEED)
# mccs_sparse_10d = [calc_mcc(z, Z) for z in zs_uni_sparse_10d]

# Plain GPLVM with 2 optimizer restarts; only 2 models are trained here
# (a literal 2 is passed instead of NUM_SEEDS).
NUM_RESTARTS = 2
cimas_12d, cimas_prior_12d, zs_12d , zs_uni_12d = train_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, 2, SEED)
mccs_12d = [calc_mcc(z, Z) for z in zs_uni_12d]

In [None]:
# Collect the per-dimension C_IMA results and cache them on disk.  When the
# training cells were skipped in this session the result variables do not
# exist (NameError); only in that case fall back to the cached file.  Any
# other failure (e.g. while saving) is allowed to surface.
try:
    cimas = [cimas_2d, cimas_3d, cimas_5d, cimas_8d, cimas_10d]
    cimas_prior = [cimas_prior_2d, cimas_prior_3d, cimas_prior_5d, cimas_prior_8d, cimas_prior_10d]
except NameError:
    # allow_pickle=True can execute code embedded in the file: only load
    # cache files produced by this notebook.
    with np.load("cimas.npz", allow_pickle=True) as cached:
        cimas = cached['cimas']
        cimas_prior = cached['cimas_prior']
else:
    np.savez("cimas.npz", cimas=cimas, cimas_prior=cimas_prior)

In [None]:
# Collect the per-dimension MCC results and cache them on disk.  Fall back to
# the cached file only when the results were not computed in this session
# (NameError); other failures are allowed to surface.
try:
    mccs = [mccs_2d, mccs_3d, mccs_5d, mccs_8d, mccs_10d]
except NameError:
    # allow_pickle=True can execute code embedded in the file: only load
    # cache files produced by this notebook.
    with np.load("mccs.npz", allow_pickle=True) as cached:
        mccs = cached['mccs']
else:
    np.savez("mccs.npz", mccs=mccs)

## Plot MCC and CIMA

In [None]:
LABELPAD = 2
TICK_PADDING = 0
dimensions = np.array([2,3,5,8,10])

fig = plt.figure(figsize=figsizes.neurips2022(nrows=2, ncols=3)['figure.figsize'])

# Left panel: mean log10 C_IMA before (prior) and after (posterior)
# optimization for each dimension, together with C_ima_digamma(D, D).
ax = fig.add_subplot(121)
ax.grid(True, which="both", ls="-.")

ax.scatter(dimensions, [np.log10(g).mean() for g in cimas_prior], c=BLUE, label="Prior")
ax.scatter(dimensions, [np.log10(g).mean() for g in cimas], c=RED, label="Posterior")

Ds = np.linspace(1, 10, 10).astype(int)
lp = ax.plot(Ds, [np.log10(C_ima_digamma(D, D)) for D in Ds], label="Bound", c="black")

# Raw strings: "\l" and "\m" are not valid Python escape sequences.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d=D$", labelpad=LABELPAD)
ax.set_xticks(dimensions.tolist())
ax.set_xticklabels(dimensions.tolist())
ax.legend(loc='center right')

# Right panel: MCC averaged over the models trained per dimension.
ax2 = fig.add_subplot(122)
ax2.grid(True, which="both", ls="-.")

ax2.scatter(dimensions, np.array(mccs).mean(1), c=RED, label="MCC")

ax2.set_ylabel(r"$\mathrm{MCC}$", labelpad=LABELPAD)
ax2.set_xlabel("$d=D$", labelpad=LABELPAD)
ax2.set_xticks(dimensions.tolist())
ax2.set_xticklabels(dimensions.tolist())
ax2.legend(loc='center right')

plt.savefig("cima_mcc.svg")


# Oil 100 dataset

In [None]:
# Oil data, 2 latent dimensions: keeps the model and the C_IMA samples taken before/after optimization.
oil_gplvm_2d, cima_oil_prior_2d, cima_oil_2d = train_oil_gplvm(2, plot=True)

In [None]:
# Oil data, 3 latent dimensions: keeps the model and the C_IMA samples taken before/after optimization.
oil_gplvm_3d, cima_oil_prior_3d, cima_oil_3d = train_oil_gplvm(3, plot=True)

In [None]:
# Oil data, 4 latent dimensions: keeps the model and the C_IMA samples taken before/after optimization.
oil_gplvm_4d, cima_oil_prior_4d, cima_oil_4d = train_oil_gplvm(4, plot=True)

In [None]:
# Oil data, 5 latent dimensions: keeps the model and the C_IMA samples taken before/after optimization.
oil_gplvm_5d, cima_oil_prior_5d, cima_oil_5d = train_oil_gplvm(5, plot=True)

In [None]:
# Oil data, 6 latent dimensions: keeps the model and the C_IMA samples taken before/after optimization.
oil_gplvm_6d, cima_oil_prior_6d, cima_oil_6d = train_oil_gplvm(6, plot=True)

In [None]:
LABELPAD = 2
TICK_PADDING = 0
cimas_oil = [cima_oil_2d, cima_oil_3d, cima_oil_4d, cima_oil_5d, cima_oil_6d]
cimas_oil_prior = [cima_oil_prior_2d, cima_oil_prior_3d, cima_oil_prior_4d, cima_oil_prior_5d,cima_oil_prior_6d]
dimensions = np.array([2,3,4,5,6])

fig = plt.figure(figsize=figsizes.neurips2022(nrows=1, ncols=1)['figure.figsize'])

# Oil data: mean log10 C_IMA before (prior) and after (posterior) optimization
# for each latent dimension.
ax = fig.add_subplot(111)
ax.grid(True, which="both", ls="-.")

ax.scatter(dimensions.tolist(), [np.log10(g).mean() for g in cimas_oil_prior], c=BLUE, label="Prior")
ax.scatter(dimensions, [np.log10(g).mean() for g in cimas_oil], c=RED, label="Posterior")

# NOTE(review): 12 is presumably the observation dimension of the oil data — confirm.
lp = ax.plot(dimensions, [np.log10(C_ima_digamma(d, 12)) for d in dimensions], label="Bound", c="black")

# Raw string: "\l" and "\m" are not valid Python escape sequences.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d$", labelpad=LABELPAD)
ax.set_xticks(dimensions.tolist())
ax.set_xticklabels(dimensions.tolist())
ax.legend(loc='lower right')

plt.savefig("cima_oil.svg")


# Stick dataset

In [None]:
# Stick dataset, 2 latent dimensions.
stick_2d, cima_stick_prior_2d, cima_stick_2d = stick(2)
# Cell output order: (C_IMA before optimization, C_IMA after optimization, C_ima_digamma(2, 102)).
cima_stick_prior_2d, cima_stick_2d, C_ima_digamma(2, 102)

In [None]:
# Stick dataset, 3 latent dimensions; overrides the default likelihood variance (1e-6) with 7e-5.
stick_3d, cima_stick_prior_3d, cima_stick_3d = stick(3, variance=7e-5)
# Cell output order: (C_IMA before optimization, C_IMA after optimization, C_ima_digamma(3, 102)).
cima_stick_prior_3d, cima_stick_3d, C_ima_digamma(3, 102)

In [None]:
# Stick dataset, 4 latent dimensions.
stick_4d, cima_stick_prior_4d, cima_stick_4d = stick(4)
# Cell output order: (C_IMA before optimization, C_IMA after optimization, C_ima_digamma(4, 102)).
cima_stick_prior_4d, cima_stick_4d, C_ima_digamma(4, 102)

In [None]:
# Stick dataset, 6 latent dimensions.
stick_6d, cima_stick_prior_6d, cima_stick_6d = stick(6)
# Cell output order: (C_IMA before optimization, C_IMA after optimization, C_ima_digamma(6, 102)).
cima_stick_prior_6d, cima_stick_6d, C_ima_digamma(6, 102)

In [None]:
# Stick dataset, 10 latent dimensions.
stick_10d, cima_stick_prior_10d, cima_stick_10d = stick(10)
# Cell output order: (C_IMA after optimization, C_IMA before optimization, C_ima_digamma(10, 102)).
# NOTE(review): the first two entries are swapped relative to the 2D-6D cells.
cima_stick_10d, cima_stick_prior_10d, C_ima_digamma(10, 102)

In [None]:
# Stick dataset, 20 latent dimensions.
stick_20d, cima_stick_prior_20d, cima_stick_20d = stick(20)
# Cell output order: (C_IMA after optimization, C_IMA before optimization, C_ima_digamma(20, 102)).
# NOTE(review): the first two entries are swapped relative to the 2D-6D cells.
cima_stick_20d, cima_stick_prior_20d, C_ima_digamma(20, 102)

In [None]:
# Stick dataset, 40 latent dimensions.
stick_40d, cima_stick_prior_40d, cima_stick_40d = stick(40)
# Cell output order: (C_IMA after optimization, C_IMA before optimization, C_ima_digamma(40, 102)).
# NOTE(review): the first two entries are swapped relative to the 2D-6D cells.
cima_stick_40d, cima_stick_prior_40d, C_ima_digamma(40, 102)

In [None]:
# Stick dataset, 60 latent dimensions.
# NOTE(review): the 60D results are not included in the summary figure cell, whose lists stop at 40D.
stick_60d, cima_stick_prior_60d, cima_stick_60d = stick(60)
# Cell output order: (C_IMA after optimization, C_IMA before optimization, C_ima_digamma(60, 102)).
# NOTE(review): the first two entries are swapped relative to the 2D-6D cells.
cima_stick_60d, cima_stick_prior_60d, C_ima_digamma(60, 102)

In [None]:
LABELPAD = 2
TICK_PADDING = 0
cimas_stick = [cima_stick_2d, cima_stick_3d, cima_stick_4d, cima_stick_6d, cima_stick_10d, cima_stick_20d, cima_stick_40d]
cimas_stick_prior = [cima_stick_prior_2d, cima_stick_prior_3d, cima_stick_prior_4d, cima_stick_prior_6d, cima_stick_prior_10d, cima_stick_prior_20d, cima_stick_prior_40d]
dimensions = np.array([2,3,4,6,10, 20, 40])

fig = plt.figure(figsize=figsizes.neurips2022(nrows=1, ncols=1)['figure.figsize'])

# Stick data: mean log10 C_IMA before (prior) and after (posterior)
# optimization for each latent dimension, on a logarithmic x axis.
ax = fig.add_subplot(111)
ax.grid(True, which="both", ls="-.")

ax.scatter(dimensions.tolist(), [np.log10(g).mean() for g in cimas_stick_prior], c=BLUE, label="Prior")
ax.scatter(dimensions, [np.log10(g).mean() for g in cimas_stick], c=RED, label="Posterior")

# NOTE(review): 102 is presumably the observation dimension of the stick data — confirm.
lp = ax.plot(dimensions, [np.log10(C_ima_digamma(d, 102)) for d in dimensions], label="Bound", c="black")

# Raw string: "\l" and "\m" are not valid Python escape sequences.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d$", labelpad=LABELPAD)
ax.set_xscale('log')
ax.set_xticks(dimensions.tolist())
ax.set_xticklabels(dimensions.tolist())

ax.legend(loc='lower right')

plt.savefig("cima_stick.svg")

# Synthetic high-dimensional observations

In [None]:
# Synthetic data whose observations have more dimensions than the latents.
NUM_SEEDS = 5
NUM_RESTARTS = 5
DIM = LATENT_DIM = 3
OBS_DIM = 8
NUM_DATA_A = 2500

np.random.seed(SEED)
# Data is generated with LATENT_DIM for both dimension arguments ...
Z, X, c = generate_moebius_data(NUM_DATA_A, LATENT_DIM, LATENT_DIM)

# ... and then mapped to OBS_DIM columns with a random Gaussian matrix A of shape (OBS_DIM, LATENT_DIM).
A= np.random.randn(OBS_DIM, LATENT_DIM)

XA = X@A.T


In [None]:
# Train a single GPLVM (num_seeds=1, 2 optimizer restarts) on the linearly mixed observations XA.
NUM_RESTARTS = 2
cimas_A, cimas_prior_A, zs_A , zs_uni_A = train_gplvm(XA, LATENT_DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, 1, SEED)
mccs_A = [calc_mcc(z, Z) for z in zs_uni_A]

In [None]:
# Cell output order: (C_IMA after optimization, C_IMA before optimization, C_ima_digamma(LATENT_DIM, OBS_DIM)).
cimas_A, cimas_prior_A, C_ima_digamma(LATENT_DIM, OBS_DIM)

In [None]:
# C_IMA sample of an unoptimized GPLVM with 5 latent dimensions built on XA.
# NOTE(review): Z, X and c are regenerated here, but the model below is built on
# XA from the earlier cell, so the freshly generated data is unused — confirm
# this is intended.
LATENT_DIM = 5
np.random.seed(SEED)
Z, X, c = generate_moebius_data(500, LATENT_DIM, LATENT_DIM)
kernel = GPy.kern.RBF(LATENT_DIM, ARD=False) #+ GPy.kern.Bias(dim)
m = GPy.models.GPLVM(np.asarray(XA), LATENT_DIM, kernel=kernel)
m.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)
C_ima_sample(m)

# Data-dependence of prior CIMA

In [None]:
# Dataset sizes passed to calc_cima_prior_sample in the cells below.
num_data = [50, 100, 200, 500, 10000]
DIM = LATENT_DIM = OBS_DIM = 2



In [None]:
# C_IMA samples of unoptimized 2D models, one per dataset size
# (reassigned by the load-or-recompute cache cell further down).
cimas_num_data2d = calc_cima_prior_sample(2, num_data)

In [None]:
# C_IMA samples of unoptimized 3D models, one per dataset size
# (reassigned by the load-or-recompute cache cell further down).
cimas_num_data3d = calc_cima_prior_sample(3, num_data)

In [None]:
# C_IMA samples of unoptimized 5D models, one per dataset size
# (reassigned by the load-or-recompute cache cell further down).
cimas_num_data5d = calc_cima_prior_sample(5, num_data)

In [None]:
# C_IMA samples of unoptimized 8D models, one per dataset size
# (reassigned by the load-or-recompute cache cell further down).
cimas_num_data8d = calc_cima_prior_sample(8, num_data)

In [None]:
# C_IMA samples of unoptimized 10D models, one per dataset size
# (reassigned by the load-or-recompute cache cell further down).
cimas_num_data10d = calc_cima_prior_sample(10, num_data)

In [None]:
# Load the cached C_IMA samples if possible, otherwise recompute and cache them.
# Only cache-related failures trigger the recomputation: a missing/unreadable
# file (OSError), a missing key (KeyError), or content numpy refuses to load
# without pickling (ValueError).  Anything else is allowed to surface.
try:
    with np.load("cimas_num_data.npz") as cached:
        cimas_num_data2d = cached['cimas_num_data2d']
        cimas_num_data3d = cached['cimas_num_data3d']
        cimas_num_data5d = cached['cimas_num_data5d']
        cimas_num_data8d = cached['cimas_num_data8d']
        cimas_num_data10d = cached['cimas_num_data10d']
except (OSError, KeyError, ValueError):
    cimas_num_data2d = calc_cima_prior_sample(2, num_data)
    cimas_num_data3d = calc_cima_prior_sample(3, num_data)
    cimas_num_data5d = calc_cima_prior_sample(5, num_data)
    cimas_num_data8d = calc_cima_prior_sample(8, num_data)
    cimas_num_data10d = calc_cima_prior_sample(10, num_data)
    np.savez("cimas_num_data.npz", cimas_num_data2d=cimas_num_data2d, cimas_num_data3d=cimas_num_data3d, cimas_num_data5d=cimas_num_data5d, cimas_num_data8d=cimas_num_data8d, cimas_num_data10d=cimas_num_data10d)

# Defined on both paths so later cells can rely on it.
cimas_num_data = [cimas_num_data2d, cimas_num_data3d, cimas_num_data5d, cimas_num_data8d, cimas_num_data10d]

In [None]:
LABELPAD = 2
TICK_PADDING = 0
dimensions = np.array([2,3,5,8,10])
cimas_num_data = [cimas_num_data2d, cimas_num_data3d, cimas_num_data5d, cimas_num_data8d, cimas_num_data10d]

fig = plt.figure(figsize=figsizes.neurips2022(nrows=2, ncols=3)['figure.figsize'])

ax = fig.add_subplot(111)
ax.grid(True, which="both", ls="-.")

# One column of points per dimension: log10 C_IMA of the unoptimized model for
# every dataset size.  The loop variables are named so that they do not
# overwrite the notebook-level colour array `c`.
for n_dims, cima_values in zip(dimensions, cimas_num_data):
    ax.scatter(n_dims*np.ones_like(cima_values), [np.log10(g) for g in cima_values], label=f"{n_dims}D")

Ds = np.linspace(1, 10, 10).astype(int)
lp = ax.plot(Ds, [np.log10(C_ima_digamma(D, D)) for D in Ds], label="Bound", c="black")

# Raw string: "\l" and "\m" are not valid Python escape sequences.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d=D$", labelpad=LABELPAD)
ax.set_xticks(dimensions)
ax.set_xticklabels(dimensions)
ax.legend(loc='center right')

plt.savefig("cima_num_data.svg")
