# Configuration

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pods

from gp_ima.ima import C_ima_digamma, C_ima_sample
import GPy
from tueplots import bundles, figsizes

In [None]:
import sys

# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
# Select matplotlib's inline backend for figures.
%matplotlib inline

# Put the current working directory first on the import path; presumably so the
# local `analysis` module imported in the next cell resolves — TODO confirm.
sys.path.insert(0, '.')

In [None]:
from analysis import plot_typography, estimate2uniform, generate_moebius_data, format_violin, RED, BLUE, calc_mcc

In [None]:
# Single switch for LaTeX text rendering; forwarded as `usetex=` to both the
# tueplots bundle and plot_typography in the following cells.
USETEX = True

In [None]:
# Start from the tueplots NeurIPS 2022 style bundle.
plt.rcParams.update(bundles.neurips2022(usetex=USETEX))
# `text.latex.preamble` has to be ONE string: Matplotlib deprecated the list
# form in 3.3 and current releases reject non-string values with a ValueError.
# Joining with newlines reproduces what the old list form expanded to.
plt.rcParams.update({
    'text.latex.preamble': '\n'.join([
        r'\usepackage{amsfonts}',  # mathbb
        r'\usepackage{amsmath}',   # boldsymbol
    ])
})

In [None]:
# Apply the project's typography helper from `analysis`; small/medium/big are
# presumably font sizes in points — confirm against analysis.plot_typography.
plot_typography(usetex=USETEX, small=12, medium=16, big=20)

# Functions

In [None]:
def train_bayesian_gplvm(X, dim, num_samples_c_ima, num_restarts, num_seeds, seed):
    """Train `num_seeds` Bayesian GPLVMs on `X` and collect C_IMA values and latents.

    Every repetition builds a fresh RBF kernel (ARD disabled) and a
    ``GPy.models.BayesianGPLVM`` with 20 inducing points, replaces its
    likelihood with a Gaussian of variance 1e-6, evaluates ``C_ima_sample``
    before and after ``optimize_restarts`` (L-BFGS), and stores ``model.X.mean``.

    Args:
        X: Observed data; passed through ``np.asarray`` before reaching GPy.
        dim: Latent dimensionality, also used as the kernel input dimension.
        num_samples_c_ima: Accepted but not used by this function.
        num_restarts: Restart count forwarded to ``optimize_restarts``.
        num_seeds: Number of independent models to train.
        seed: Accepted but not used; the reseeding call is commented out.

    Returns:
        A 4-tuple of lists with one entry per repetition, in this order:
        C_IMA after optimisation, C_IMA before optimisation, ``model.X.mean``,
        and ``estimate2uniform`` applied to that mean.
    """
    cima_after, cima_before = [], []
    latent_means, latent_uniform = [], []
    # np.random.seed(seed)
    for _ in range(num_seeds):
        rbf = GPy.kern.RBF(dim, ARD=False)  # + GPy.kern.Bias(dim)
        model = GPy.models.BayesianGPLVM(np.asarray(X), dim, kernel=rbf, num_inducing=20)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)

        # Evaluate C_IMA on the untrained model first, then again after fitting.
        cima_before.append(C_ima_sample(model))
        model.optimize_restarts(num_restarts, optimizer='lbfgs')
        cima_after.append(C_ima_sample(model))

        z_mean = model.X.mean
        latent_means.append(z_mean)
        latent_uniform.append(estimate2uniform(z_mean))

    return cima_after, cima_before, latent_means, latent_uniform

def train_gplvm(X, dim, num_samples_c_ima, num_restarts, num_seeds, seed):
    """Train `num_seeds` GPLVMs on `X` and collect C_IMA values and latents.

    Every repetition builds a fresh RBF kernel (ARD disabled) and a
    ``GPy.models.GPLVM``, replaces its likelihood with a Gaussian of variance
    1e-6, evaluates ``C_ima_sample`` before and after ``optimize_restarts``
    (L-BFGS), and stores ``model.X.values``.

    Args:
        X: Observed data; passed through ``np.asarray`` before reaching GPy.
        dim: Latent dimensionality, also used as the kernel input dimension.
        num_samples_c_ima: Accepted but not used by this function.
        num_restarts: Restart count forwarded to ``optimize_restarts``.
        num_seeds: Number of independent models to train.
        seed: Accepted but not used; the reseeding call is commented out.

    Returns:
        A 4-tuple of lists with one entry per repetition, in this order:
        C_IMA after optimisation, C_IMA before optimisation,
        ``model.X.values``, and ``estimate2uniform`` applied to those values.
    """
    cima_after, cima_before = [], []
    latent_values, latent_uniform = [], []
    # np.random.seed(seed)
    for _ in range(num_seeds):
        rbf = GPy.kern.RBF(dim, ARD=False)  # + GPy.kern.Bias(dim)
        model = GPy.models.GPLVM(np.asarray(X), dim, kernel=rbf)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)

        # Evaluate C_IMA on the untrained model first, then again after fitting.
        cima_before.append(C_ima_sample(model))
        model.optimize_restarts(num_restarts, optimizer='lbfgs')
        cima_after.append(C_ima_sample(model))

        z_values = model.X.values
        latent_values.append(z_values)
        latent_uniform.append(estimate2uniform(z_values))

    return cima_after, cima_before, latent_values, latent_uniform

def calc_cima_prior_sample(dim, num_data, seed=42):
    """Evaluate ``C_ima_sample`` on untrained GPLVMs for several data-set sizes.

    The global NumPy RNG is seeded once. Then, for each ``n`` in ``num_data``,
    a data set is drawn with ``generate_moebius_data(n, dim, dim)``, a GPLVM
    with an RBF + Bias kernel is built on it (never optimised), its likelihood
    is replaced by a Gaussian of variance 1e-6, and ``C_ima_sample`` is taken.

    Args:
        dim: Used for both dimension arguments of the generator and as the
            GPLVM latent dimension.
        num_data: Iterable of data-set sizes.
        seed: Seed passed to ``np.random.seed`` before the loop.

    Returns:
        List of ``C_ima_sample`` results, in the order of ``num_data``.
    """
    np.random.seed(seed)

    def _untrained_cima(n_points):
        # Only the observations are needed; the other two outputs are discarded.
        _, observations, _ = generate_moebius_data(n_points, dim, dim)
        kern = GPy.kern.RBF(dim, ARD=False) + GPy.kern.Bias(dim)
        model = GPy.models.GPLVM(np.asarray(observations), dim, kernel=kern)
        model.likelihood = GPy.likelihoods.Gaussian(variance=1e-6)
        return C_ima_sample(model)

    return [_untrained_cima(n) for n in num_data]


# Möbius transform

In [None]:
# Settings shared by the experiment cells below.
NUM_DATA = 500   # first argument of every generate_moebius_data call
SEED = 42        # passed to np.random.seed before each data set is generated
NUM_SEEDS = 5    # number of models trained per configuration (num_seeds argument)



## 2D

In [None]:

# 2D settings and data set. The next cell repeats these statements before
# training, so this cell is only needed to inspect the data on its own.
NUM_SAMPLES_C_IMA, NUM_RESTARTS = 100, 5
DIM, LATENT_DIM, OBS_DIM = 2, 2, 2

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

In [None]:

# 2D experiment: settings and data.
NUM_SAMPLES_C_IMA, NUM_RESTARTS = 100, 5
DIM, LATENT_DIM, OBS_DIM = 2, 2, 2

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM, trained with 5 restarts per model.
(
    cimas_sparse_2d,
    cimas_sparse_prior_2d,
    zs_sparse_2d,
    zs_uni_sparse_2d,
) = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED)
mccs_sparse_2d = [calc_mcc(z_est, Z) for z_est in zs_uni_sparse_2d]

# Plain GPLVM, trained with only 2 restarts per model.
NUM_RESTARTS = 2
cimas_2d, cimas_prior_2d, zs_2d, zs_uni_2d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED
)
mccs_2d = [calc_mcc(z_est, Z) for z_est in zs_uni_2d]

In [None]:
# Three-panel 2D figure: true latents, observations, and the uniformised
# Bayesian-GPLVM reconstruction, all coloured by `cima`.
LABELPAD, TICK_PADDING = 1, 2
IDX = IDX_SPARSE = 0  # which trained model to show; only IDX_SPARSE is used below

fig = plt.figure(
    figsize=figsizes.neurips2022(nrows=1, ncols=2, rel_width=1)['figure.figsize']
)

# Panel 1: ground-truth latents.
ax = fig.add_subplot(1, 3, 1)
ax.scatter(Z[:, 0], Z[:, 1], c=cima, cmap="hsv", label="Latents")

# Panel 2: observations.
ax2 = fig.add_subplot(1, 3, 2)
ax2.scatter(X[:, 0], X[:, 1], c=cima, cmap="hsv", label="Observations")

# The plain-GPLVM panel is disabled:
# ax3 = fig.add_subplot(143)
# ax3.scatter(zs_uni_2d[IDX][:, 0], zs_uni_2d[IDX][:, 1], c=c, cmap="hsv", label="Rec. (GPLVM)")

# Panel 3: reconstruction from the Bayesian GPLVM selected by IDX_SPARSE.
ax4 = fig.add_subplot(1, 3, 3)
ax4.scatter(
    zs_uni_sparse_2d[IDX_SPARSE][:, 0],
    zs_uni_sparse_2d[IDX_SPARSE][:, 1],
    c=cima,
    cmap="hsv",
    label="Reconstruction",
)

# Hide every tick mark and tick label on all panels.
ticksoff = {
    "labelleft": False,
    "labelright": False,
    "left": False,
    "right": False,
    "labelbottom": False,
    "bottom": False,
}
ax.tick_params(axis="both", **ticksoff)
ax2.tick_params(axis="both", **ticksoff)
# ax3.tick_params(axis="both", **ticksoff)
ax4.tick_params(axis="both", **ticksoff)

ax.set_title("Latents")
ax2.set_title("Observations")
# ax3.set_title("Rec. (GPLVM)")
ax4.set_title("Reconstruction")

plt.savefig("gplvm_ima.svg")

## 3D

In [None]:
# 3D experiment: settings and data.
# NUM_RESTARTS = 5 only feeds the disabled Bayesian GPLVM run below; it is
# overwritten before the plain GPLVM is trained.
NUM_SEEDS, NUM_RESTARTS = 5, 5
DIM, LATENT_DIM, OBS_DIM = 3, 3, 3

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension:
# cimas_sparse_3d, cimas_sparse_prior_3d, zs_sparse_3d, zs_uni_sparse_3d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_3d = [calc_mcc(z, Z) for z in zs_uni_sparse_3d]

# Plain GPLVM, trained with 2 restarts per model.
NUM_RESTARTS = 2
cimas_3d, cimas_prior_3d, zs_3d, zs_uni_3d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED
)
mccs_3d = [calc_mcc(z_est, Z) for z_est in zs_uni_3d]

## 5D

In [None]:
# 5D experiment: settings and data.
# NUM_RESTARTS = 5 only feeds the disabled Bayesian GPLVM run below; it is
# overwritten before the plain GPLVM is trained.
NUM_RESTARTS = 5
DIM, LATENT_DIM, OBS_DIM = 5, 5, 5

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension:
# cimas_sparse_5d, cimas_sparse_prior_5d, zs_sparse_5d, zs_uni_sparse_5d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_5d = [calc_mcc(z, Z) for z in zs_uni_sparse_5d]

# Plain GPLVM, trained with 2 restarts per model.
NUM_RESTARTS = 2
cimas_5d, cimas_prior_5d, zs_5d, zs_uni_5d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED
)
mccs_5d = [calc_mcc(z_est, Z) for z_est in zs_uni_5d]

## 8D

In [None]:
# 8D experiment: settings and data.
# NUM_RESTARTS = 5 only feeds the disabled Bayesian GPLVM run below; it is
# overwritten before the plain GPLVM is trained.
NUM_RESTARTS = 5
DIM, LATENT_DIM, OBS_DIM = 8, 8, 8

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension:
# cimas_sparse_8d, cimas_sparse_prior_8d, zs_sparse_8d, zs_uni_sparse_8d = train_bayesian_gplvm(X, DIM, NUM_SAMPLES_C_IMA,
#                                                                                               NUM_RESTARTS, NUM_SEEDS,
#                                                                                               SEED)
# mccs_sparse_8d = [calc_mcc(z, Z) for z in zs_uni_sparse_8d]

# Plain GPLVM, trained with 2 restarts per model.
NUM_RESTARTS = 2
cimas_8d, cimas_prior_8d, zs_8d, zs_uni_8d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED
)
mccs_8d = [calc_mcc(z_est, Z) for z_est in zs_uni_8d]

## 10D

In [None]:
# 10D experiment: settings and data.
# NUM_RESTARTS = 5 only feeds the disabled Bayesian GPLVM run below; it is
# overwritten before the plain GPLVM is trained.
NUM_RESTARTS = 5
DIM, LATENT_DIM, OBS_DIM = 10, 10, 10

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled for this dimension:
# cimas_sparse_10d, cimas_sparse_prior_10d, zs_sparse_10d, zs_uni_sparse_10d = train_bayesian_gplvm(X, DIM,
#                                                                                                   NUM_SAMPLES_C_IMA,
#                                                                                                   NUM_RESTARTS,
#                                                                                                   NUM_SEEDS, SEED)
# mccs_sparse_10d = [calc_mcc(z, Z) for z in zs_uni_sparse_10d]

# Plain GPLVM, trained with 2 restarts per model.
NUM_RESTARTS = 2
cimas_10d, cimas_prior_10d, zs_10d, zs_uni_10d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, NUM_SEEDS, SEED
)
mccs_10d = [calc_mcc(z_est, Z) for z_est in zs_uni_10d]

In [None]:
# 12D experiment: settings and data.
# NUM_RESTARTS = 5 only feeds the disabled Bayesian GPLVM run below; it is
# overwritten before the plain GPLVM is trained.
NUM_RESTARTS = 5
DIM, LATENT_DIM, OBS_DIM = 12, 12, 12
NUM_DATA = 500

# Reset the global NumPy RNG right before drawing the data.
np.random.seed(SEED)
Z, X, cima = generate_moebius_data(NUM_DATA, LATENT_DIM, OBS_DIM)

# Bayesian GPLVM run is disabled. NOTE(review): the commented-out code still
# uses the *_10d names from the 10D cell; rename to *_12d before re-enabling.
# cimas_sparse_10d, cimas_sparse_prior_10d, zs_sparse_10d, zs_uni_sparse_10d = train_bayesian_gplvm(X, DIM,
#                                                                                                   NUM_SAMPLES_C_IMA,
#                                                                                                   NUM_RESTARTS,
#                                                                                                   NUM_SEEDS, SEED)
# mccs_sparse_10d = [calc_mcc(z, Z) for z in zs_uni_sparse_10d]

# Plain GPLVM with 2 restarts. Unlike the other dimensions, the number of
# models is hard-coded to 2 instead of NUM_SEEDS, and the *_12d results are not
# part of the aggregated `cimas` / `mccs` lists built in the cells below.
NUM_RESTARTS = 2
cimas_12d, cimas_prior_12d, zs_12d, zs_uni_12d = train_gplvm(
    X, DIM, NUM_SAMPLES_C_IMA, NUM_RESTARTS, 2, SEED
)
mccs_12d = [calc_mcc(z_est, Z) for z_est in zs_uni_12d]

In [None]:
# Collect the per-dimension GPLVM C_IMA results of this session and cache them.
# If the experiment cells were not run in this kernel, the result variables do
# not exist and the cached archive is used instead.
try:
    cimas = [cimas_2d, cimas_3d, cimas_5d, cimas_8d, cimas_10d]
    cimas_prior = [cimas_prior_2d, cimas_prior_3d, cimas_prior_5d, cimas_prior_8d, cimas_prior_10d]
except NameError:
    # Only a missing result variable triggers the fallback. A bare `except`
    # would also swallow save errors and replace fresh results by stale ones.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # archives written by this notebook.
    with np.load("cimas.npz", allow_pickle=True) as cached:
        cimas = cached['cimas']
        cimas_prior = cached['cimas_prior']
else:
    # Saving sits outside the guarded region so that write failures surface.
    np.savez("cimas.npz", cimas=cimas, cimas_prior=cimas_prior)

In [None]:
# Collect the per-dimension MCC scores of this session and cache them. If the
# experiment cells were not run in this kernel, the score variables do not
# exist and the cached archive is used instead.
try:
    mccs = [mccs_2d, mccs_3d, mccs_5d, mccs_8d, mccs_10d]
except NameError:
    # Only a missing score variable triggers the fallback. A bare `except`
    # would also swallow save errors and replace fresh scores by stale ones.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # archives written by this notebook.
    with np.load("mccs.npz", allow_pickle=True) as cached:
        mccs = cached['mccs']
else:
    # Saving sits outside the guarded region so that write failures surface.
    np.savez("mccs.npz", mccs=mccs)

## Plot MCC and CIMA

In [None]:
# Two-panel summary figure over the dimensions 2, 3, 5, 8 and 10:
#   left  - mean log10 C_IMA before ("Prior") and after ("Posterior") training,
#           together with the C_ima_digamma curve labelled "Bound";
#   right - mean MCC per dimension.
LABELPAD = 2
TICK_PADDING = 0
dimensions = np.array([2,3,5,8,10])

fig = plt.figure(figsize=figsizes.neurips2022(nrows=2, ncols=3)['figure.figsize'])


# --- Left panel: log10 C_IMA -------------------------------------------------
ax = fig.add_subplot(121)
ax.grid(True, which="both", ls="-.")

# Take log10 of each run first, then average over the runs of a dimension.
ax.scatter(dimensions, [np.log10(g).mean() for g in cimas_prior], c=BLUE, label="Prior")
ax.scatter(dimensions, [np.log10(g).mean() for g in cimas], c=RED, label="Posterior")

# Integer dimensions 1..10 at which the reference curve is evaluated.
Ds = np.arange(1, 11)
lp = ax.plot(Ds, [np.log10(C_ima_digamma(D, D)) for D in Ds], label="Bound", c="black")

# Raw strings: "\l" and "\m" are invalid escape sequences in normal literals
# (SyntaxWarning on Python 3.12+); the resulting text is unchanged.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d=D$", labelpad=LABELPAD)
ax.set_xticks(dimensions.tolist())
ax.set_xticklabels(dimensions.tolist())
# Attach the legend to this axes explicitly instead of relying on pyplot's
# notion of the "current" axes.
ax.legend(loc='center right')

# --- Right panel: MCC --------------------------------------------------------
ax2 = fig.add_subplot(122)

ax2.grid(True, which="both", ls="-.")

# Average over axis 1, i.e. over the runs of each dimension.
ax2.scatter(dimensions, np.array(mccs).mean(1), c=RED, label="MCC")


ax2.set_ylabel(r"$\mathrm{MCC}$", labelpad=LABELPAD)
ax2.set_xlabel("$d=D$", labelpad=LABELPAD)

ax2.set_xticks(dimensions.tolist())
ax2.set_xticklabels(dimensions.tolist())

ax2.legend(loc='center right')

plt.savefig("cima_mcc.svg")


# Data-dependence of prior CIMA

In [None]:
# Data-set sizes passed to calc_cima_prior_sample in the cells below.
num_data = [50, 100, 200, 500, 1000, 2000]
# NOTE(review): the calc_cima_prior_sample calls below pass their dimension as
# a literal, so these three names are not read by them.
DIM = LATENT_DIM = OBS_DIM = 2



In [None]:
# C_IMA of untrained GPLVMs for dimension 2, one value per entry of num_data
# (seed left at its default of 42).
cimas_num_data2d = calc_cima_prior_sample(2, num_data)

In [None]:
# C_IMA of untrained GPLVMs for dimension 3, one value per entry of num_data
# (seed left at its default of 42).
cimas_num_data3d = calc_cima_prior_sample(3, num_data)

In [None]:
# C_IMA of untrained GPLVMs for dimension 5, one value per entry of num_data
# (seed left at its default of 42).
cimas_num_data5d = calc_cima_prior_sample(5, num_data)

In [None]:
# C_IMA of untrained GPLVMs for dimension 8, one value per entry of num_data
# (seed left at its default of 42).
cimas_num_data8d = calc_cima_prior_sample(8, num_data)

In [None]:
# C_IMA of untrained GPLVMs for dimension 10, one value per entry of num_data
# (seed left at its default of 42).
cimas_num_data10d = calc_cima_prior_sample(10, num_data)

In [None]:
# Load the data-size study from its cache; recompute and store it when the
# cache cannot be used.
try:
    # Open the archive once instead of once per array.
    with np.load("cimas_num_data.npz") as cached:
        cimas_num_data2d = cached['cimas_num_data2d']
        cimas_num_data3d = cached['cimas_num_data3d']
        cimas_num_data5d = cached['cimas_num_data5d']
        cimas_num_data8d = cached['cimas_num_data8d']
        cimas_num_data10d = cached['cimas_num_data10d']
except (OSError, KeyError, ValueError) as cache_error:
    # OSError: archive missing or unreadable; KeyError: an array is absent;
    # ValueError: the stored data cannot be loaded without pickling.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print(f"cimas_num_data.npz not usable ({cache_error!r}); recomputing.")
    cimas_num_data2d = calc_cima_prior_sample(2, num_data)
    cimas_num_data3d = calc_cima_prior_sample(3, num_data)
    cimas_num_data5d = calc_cima_prior_sample(5, num_data)
    cimas_num_data8d = calc_cima_prior_sample(8, num_data)
    cimas_num_data10d = calc_cima_prior_sample(10, num_data)
    np.savez(
        "cimas_num_data.npz",
        cimas_num_data2d=cimas_num_data2d,
        cimas_num_data3d=cimas_num_data3d,
        cimas_num_data5d=cimas_num_data5d,
        cimas_num_data8d=cimas_num_data8d,
        cimas_num_data10d=cimas_num_data10d,
    )

# Defined on both paths (previously only after a successful cache load).
cimas_num_data = [cimas_num_data2d, cimas_num_data3d, cimas_num_data5d, cimas_num_data8d, cimas_num_data10d]

In [None]:
# Figure: log10 C_IMA of untrained GPLVMs per dimension (colour) and data-set
# size (marker), together with the C_ima_digamma curve labelled "Bound".
import matplotlib.lines as mlines

LABELPAD = 2
TICK_PADDING = 0
dimensions = np.array([2,3,5,8,10])
cimas_num_data = [cimas_num_data2d, cimas_num_data3d, cimas_num_data5d, cimas_num_data8d, cimas_num_data10d]

fig = plt.figure(figsize=figsizes.neurips2022(nrows=1, ncols=1)['figure.figsize'])


ax = fig.add_subplot(111)
ax.grid(True, which="both", ls="-.")

# One marker per data-set size. num_data has six entries; with only five
# markers, zip() silently dropped the largest size from both plot and legend.
markers = ["o", "s", "D", "X", "P", "^"]
colors = ["blue", "orange", "green", "red", "purple"]
assert len(markers) >= len(num_data), "add a marker for every entry of num_data"
assert len(colors) >= len(dimensions), "add a colour for every dimension"

# Loop names are chosen so that the notebook-level `cima` array (used to colour
# the earlier scatter plots) is not overwritten by this cell.
for dim_value, dim_color, cimas_for_dim in zip(dimensions, colors, cimas_num_data):
    for size_marker, cima_value in zip(markers, cimas_for_dim):
        ax.scatter(dim_value, np.log10(cima_value), marker=size_marker, c=dim_color, s=20)


# Integer dimensions 1..10 at which the reference curve is evaluated.
Ds = np.arange(1, 11)
lp = ax.plot(Ds, [np.log10(C_ima_digamma(D, D)) for D in Ds], label="Bound", c="black")


# Raw string: "\l" and "\m" are invalid escape sequences in normal literals
# (SyntaxWarning on Python 3.12+); the resulting text is unchanged.
ax.set_ylabel(r"$\log_{10}c_{\mathrm{IMA}}$", labelpad=LABELPAD)
ax.set_xlabel("$d=D$", labelpad=LABELPAD)
ax.set_xticks(dimensions)
ax.set_xticklabels(dimensions)

# Legend 1: marker shape -> data-set size, plus the bound line.
legend_elements = [
    mlines.Line2D([0], [0], marker=m, color='w', label=f"${n}$",
                  markerfacecolor='black', markersize=10)
    for n, m in zip(num_data, markers)
]
legend_elements += [mlines.Line2D([], [], color='black', linestyle='solid', label="Bound",
                                  markersize=10)]

legend_marker = ax.legend(handles=legend_elements, loc='lower right')


# Legend 2: colour -> dimension.
legend_elements = [
    mlines.Line2D([0], [0], marker='o', color='w', label=f"{d}",
                  markerfacecolor=c, markersize=10)
    for d, c in zip(dimensions, colors)
]

legend_dim = ax.legend(handles=legend_elements, loc='lower center', ncol=len(dimensions), handlelength=0.5, columnspacing=0.5)


# The second ax.legend() call replaced the first legend as the axes' legend, so
# the first has to be re-added as a plain artist. legend_dim is already the
# axes' legend and needs no add_artist (that would draw it twice).
ax.add_artist(legend_marker)


plt.savefig("cima_num_data.svg")
