## Experiment parameters

In [None]:
# Operating mode: 'save' or 'rerun' (any other value is rejected when the dataset is loaded).
# In 'save' mode the latent points and accuracy tables are written under the experiment folder.
save_or_rerun = 'save'

# Dataset to run the experiment on: 'oil flow' or 'USPS digits'.
dataset = 'oil flow'

## Model parameters

In [None]:
# Number of rows of the PCA initialisation kept as Z; also passed to GPLVM as M
# (presumably the number of inducing points — confirm against library.gplvm).
m = 20
# Argument of GaussHermiteExpectation; hermite_points**Q is also used as one of
# the Monte Carlo sample counts.
hermite_points = 2
# Number of repeated runs built for every Monte Carlo model description.
montecarlo_runs = 10
# Initial value of every entry of X_var handed to the GPLVM.
variational_variance = 0.1

In [None]:
import datetime
from pathlib import Path
# Timestamp (YYYYMMDDTHHMMSS) identifying this execution of the notebook.
experiment_key = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
# Root output folder; a later cell extends it with the dataset name and the experiment key.
experiment_folder = Path('results')
print('experiment key:', experiment_key)

---

In [None]:
import os
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.FATAL)
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import scipy.io as sio
import gpflow
import IPython.display as ipd
import matplotlib as mpl
import matplotlib.pyplot as plt

import library.kernels
from library.expectations import UnscentedExpectation, AnalyticExpectation, GaussHermiteExpectation, MonteCarloExpectation
from library.gplvm import GPLVM

%matplotlib inline

In [None]:
def plot_image_scatter(ax, data, images, image_zoom=1, hide_overlaps=True, color=(1,1,1), previously_drawn_boxes=None):
    """Place one small image per row of ``data`` on ``ax``.

    Each image is turned into an RGBA picture whose alpha channel is the image
    itself and whose colour channels are ``(1 - image)`` scaled by ``color``
    (clipped to [0, 1]).

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : iterable of 2-D positions (data coordinates), one per image.
    images : iterable of 2-D arrays, aligned with ``data``.
    image_zoom : zoom factor forwarded to ``OffsetImage``.
    hide_overlaps : when true, skip an image whose box overlaps an already drawn box.
    color : ``(r, g, b)`` multipliers for the colour channels.
    previously_drawn_boxes : boxes from an earlier call that should also block
        new images. A non-empty list is extended in place.

    Returns
    -------
    (drawn, boxes) : a boolean array flagging which images were drawn, and the
        list of bounding boxes of everything drawn so far.

    Notes
    -----
    The box size is only known after the first image has been rendered, so the
    first image is always drawn and is never tested for overlaps.
    """
    boxes = previously_drawn_boxes or []
    shown = []
    box_size = None  # (width, height) of one image in data units, measured once
    red, green, blue = color

    ax.update_datalim(data)
    ax.autoscale()
    for center, glyph in zip(data, images):
        rgba = np.clip(
            np.dstack([(1-glyph)*red, (1-glyph)*green, (1-glyph)*blue, glyph]),
            0, 1
        )
        artist = mpl.offsetbox.AnnotationBbox(
            mpl.offsetbox.OffsetImage(rgba, zoom=image_zoom),
            center, xycoords='data', frameon=False
        )

        if box_size is None:
            # First image: draw it, render the canvas, then measure its extent.
            ax.add_artist(artist)
            ax.figure.canvas.draw()
            corners = ax.transData.inverted().transform(artist.get_children()[1].get_bbox())
            box_size = np.array([corners[1,0]-corners[0,0], corners[1,1]-corners[0,1]])
            box = mpl.transforms.Bbox.from_bounds(*(center-box_size/2), *box_size)
        else:
            box = mpl.transforms.Bbox.from_bounds(*(center-box_size/2), *box_size)
            if hide_overlaps and box.count_overlaps(boxes) > 0:
                shown.append(False)
                continue
            ax.add_artist(artist)

        boxes.append(box)
        shown.append(True)
    return np.array(shown), boxes

def kernel_name(k):
    """Return a short label for kernel ``k``.

    A ``gpflow.kernels.Sum`` yields the labels of its components joined by '+',
    a class named ``MLP`` yields ``MLP`` followed by its ``layers`` attribute,
    and anything else yields its class name.
    """
    kind = type(k)
    if kind is gpflow.kernels.Sum:
        return '+'.join(kernel_name(part) for part in k.kernels)
    if kind.__name__ == 'MLP':
        return f'MLP{k.layers}'
    return kind.__name__

In [None]:
# Load the selected dataset into Y (observations) and labels, and derive the
# output folder for this experiment.
if save_or_rerun not in ['save','rerun']:
    raise ValueError(f'Invalid operation {save_or_rerun}')

dataset_path_friendly = dataset.replace(' ','_')
if dataset == 'oil flow':
    # NpzFile keeps the archive open; the context manager closes it once the
    # two arrays have been read into memory.
    with np.load('data/three_phase_oil_flow.npz') as data:
        Y = data['Y']
        labels = data['labels']
elif dataset == 'USPS digits':
    data = sio.loadmat('data/usps_all.mat')
    N = 500  # samples kept per digit class
    # Presumably data['data'] is (256 pixels, samples, 10 classes) — TODO confirm.
    # After the transpose/reshape each row of Y is one 256-pixel image.
    Y = data['data'][:,0:N,:].T.reshape(-1,256)/256
    del data
    # N consecutive labels per class, in the order 1..9 then 0.
    labels = np.repeat([1,2,3,4,5,6,7,8,9,0], N)
    digit_images = np.array([y.reshape(16,16).T for y in Y])
else:
    raise ValueError(f'Unknown Dataset {dataset}')
n,D = Y.shape

# Extend the root folder only once, so re-running this cell does not nest
# <dataset>/<key> inside itself again.
if experiment_folder.name != experiment_key:
    experiment_folder = experiment_folder / dataset_path_friendly / experiment_key

In [None]:
# Seed shared by NumPy, TensorFlow and the cross-validation splitter further down.
random_seed = 42
def set_seed():
    """Reset the NumPy and TensorFlow random generators to ``random_seed``."""
    np.random.seed(random_seed)
    tf.random.set_random_seed(random_seed)
# Seed once right away so everything that follows starts from a known state.
set_seed()

In [None]:
# Build (but do not train) one GPLVM per (expectation, kernel, alpha, run).
if save_or_rerun in ['save','rerun']:
    # Kernels are described as (factory, Q): the factory builds a fresh kernel
    # and Q is its latent dimensionality.
    if dataset == 'oil flow':
        analytic_kernel = (lambda: gpflow.kernels.RBF(5, ARD=True), 5)
        kernels = [
#             analytic_kernel,
            (lambda: gpflow.kernels.Matern32(10, ARD=True), 10),
        ]
    elif dataset == 'USPS digits':
        analytic_kernel = (lambda: gpflow.kernels.RBF(5, ARD=True), 5)
        kernels = [
            analytic_kernel,
            (lambda: library.kernels.MLP(5,[30,60]), 5),
        ]
    else:
        raise Exception('Unknown dataset')

    # A description is (kernel factory, expectation) or
    # (kernel factory, expectation, alpha, runs). analytic_kernel is a
    # (factory, Q) pair, so only its factory goes into the description;
    # passing the pair itself would make kernel_maker() fail below.
    analytic_kernel_maker, _ = analytic_kernel
    model_descriptions = [
        (analytic_kernel_maker, AnalyticExpectation()),
        *[(k, UnscentedExpectation()) for k,_ in kernels],
        *[(k, GaussHermiteExpectation(hermite_points)) for k,_ in kernels],
        *[(k, expt, alpha, montecarlo_runs)
            for alpha in [0.01]
            for k,Q in kernels
            # The set removes duplicate sample counts; sorting fixes the order.
            for expt in [MonteCarloExpectation(points) for points in sorted({2*Q,hermite_points**Q,200})]
         ],
    ]

    gplvm_models = {}
    for model_description in model_descriptions:
        kernel_maker, expectation = model_description[:2]

        # Two-element descriptions get alpha=0 and a single run.
        if len(model_description) == 2:
            alpha, runs = 0, 1
        else:
            alpha, runs = model_description[2:]

        for run in range(runs):
            set_seed()
            kernel = kernel_maker()
            Q = kernel.input_dim
            X_mean = gpflow.models.PCA_reduce(Y, Q)
            # Re-seed so the choice of the m rows used for Z does not depend on
            # how much randomness the steps above consumed.
            set_seed()
            Z = np.random.permutation(X_mean.copy())[:m]
            with gpflow.defer_build():
                model = GPLVM(expectation, X_mean=X_mean, X_var=variational_variance*np.ones((n, Q)), Y=Y, kern=kernel, M=m, Z=Z)
                if dataset == 'oil flow':
                    model.likelihood.variance = 0.01
                elif dataset == 'USPS digits':
                    model.likelihood.variance = 0.001
                # expectation.__name__ is read from the instance; presumably the
                # library's expectation classes define it — TODO confirm.
                model._name = expectation.__name__.replace('(','-').replace(')','') + '-' + str(run)
            model.build()
            gplvm_models[(expectation.__name__, kernel_name(kernel), alpha, run)] = model
    gplvm_models = pd.Series(gplvm_models)
    gplvm_models.index.set_names(['expectation', 'kernel', 'alpha', 'run'], inplace=True)

    # Column index for the result tables: every model plus a plain PCA baseline.
    columns = gplvm_models.index.insert(0, ('analytic','PCA',0,0))

In [None]:
# Fit GPLVMs
bar = ipd.ProgressBar(len(gplvm_models))
ipd.display(ipd.HTML('<h4>Progress:</h4>'))
bar.display()

for key, model in gplvm_models.items():
    key = dict(zip(gplvm_models.index.names, key))
    print(key,'@',datetime.datetime.now().strftime("%I:%M %p"))

    if key['run'] == 0:
        set_seed()

    if key['alpha'] > 0:
        opt = gpflow.train.AdamOptimizer(key['alpha'])
    else:
        opt = gpflow.train.ScipyOptimizer()

    tf.logging.set_verbosity(tf.logging.INFO)
    %time opt.minimize(model, maxiter=5000)
    tf.logging.set_verbosity(tf.logging.FATAL)
    bar.progress += 1
print('Finished @', datetime.datetime.now().strftime("%I:%M %p"))

In [None]:
# Collect a 2-D view of every model's latent space (mean and variance) in one
# table whose columns are (expectation, kernel, alpha, run, stat, dim).
latent_space = pd.DataFrame(
    index=range(len(Y)),
    columns=pd.MultiIndex.from_tuples(
        [
            (*column, stat, dim)
            for column in columns
            for stat in ['mean', 'variance']
            # The PCA baseline has no variance, so it only gets 'mean' columns.
            for dim in range(2) if not (column[1] == 'PCA' and stat == 'variance')
        ],
        names=[*columns.names, 'stat', 'dim']
    ),
    dtype=float
).sort_index(axis=1)

latent_space.loc[:, pd.IndexSlice['analytic', 'PCA', 0, 0, 'mean',:]] = gpflow.models.PCA_reduce(Y, 2)

for key, model in gplvm_models.items():
    kern = model.kern
    if isinstance(kern, gpflow.kernels.Stationary):
        # sqrt(l)/l equals 1/sqrt(l): larger for shorter lengthscales.
        # Keep the two latent dimensions with the largest values.
        sensibility = np.sqrt(kern.lengthscales.value)/kern.lengthscales.value
        dims = np.argsort(sensibility)[[-1, -2]]
    elif type(kern) is library.kernels.MLP:
        # NOTE(review): requires the first entry of kern.layers to be 2 — confirm
        # this matches how the MLP kernels are constructed.
        assert kern.layers[0] == 2
        dims = [0,1]
    else:
        raise Exception(f'Unknown kernel: {type(kern)}')
    latent_space[(*key, 'mean')] = model.X_mean.value[:, dims]
    latent_space[(*key, 'variance')] = model.X_var.value[:, dims]

if save_or_rerun == 'save':
    points_folder = experiment_folder / 'points'
    # exist_ok lets this cell be re-run within the same experiment.
    points_folder.mkdir(parents=True, exist_ok=True)
    latent_space.to_hdf(points_folder/f'{dataset_path_friendly}.hdf', key='points')

In [None]:
# Score every latent space with a 1-nearest-neighbour classifier under
# shuffled k-fold cross-validation.
folds = 5

# accuracy: one row per fold, one column per model (before stacking 'run').
accuracy = pd.DataFrame(
    index=pd.RangeIndex(folds, name='fold'),
    columns=columns,
    dtype=float
).sort_index(axis=1)

# missed: for each fold and each sample, whether the classifier fitted on that
# fold's training split mislabels the sample.
missed = pd.DataFrame(
    index=pd.MultiIndex.from_product([range(folds), range(len(Y))], names=['fold', None]),
    columns=accuracy.columns,
    dtype=bool
).sort_index(axis=1)


kf = sk.model_selection.KFold(folds, shuffle=True, random_state=random_seed)
for i, (trainIdx, testIdx) in enumerate(kf.split(labels)):
    y_train, y_test = labels[trainIdx], labels[testIdx]
    for column in columns:
        mean_key = (*column, 'mean')
        knn = sk.neighbors.KNeighborsClassifier(n_neighbors=1, metric='minkowski')
        train, test = latent_space.loc[trainIdx, mean_key], latent_space.loc[testIdx, mean_key]
        knn.fit(train, y_train)
        # Predict on every sample (training ones included) to mark misses.
        predicted_labels = knn.predict(latent_space[mean_key])
        missed.loc[pd.IndexSlice[i,:],column] = (labels != predicted_labels)
        y_pred = knn.predict(test)
        accuracy.at[i,column] = sk.metrics.accuracy_score(y_test, y_pred)

# Move 'run' from the columns into the row index: rows become (fold, run).
accuracy = accuracy.stack('run')
# Mean and standard deviation per model, in percent.
accuracy_summary = (accuracy.describe().loc[['mean','std']] * 100).T
if save_or_rerun == 'save':
    tables_folder = experiment_folder / 'tabs'
    # exist_ok lets this cell be re-run within the same experiment.
    tables_folder.mkdir(parents=True, exist_ok=True)
    accuracy_summary.to_csv(tables_folder/f'acc_{dataset_path_friendly}_mean.csv')
    accuracy.to_csv(tables_folder/f'acc_{dataset_path_friendly}.csv')

print('Finished @', datetime.datetime.now().strftime("%I:%M %p"))
ipd.display(accuracy_summary)

In [None]:
# Save one figure per (expectation, kernel, alpha): the latent space of the
# (fold, run) pair with the median accuracy, with misclassified points marked.
for key in columns.droplevel(-1).drop_duplicates():
    # Rows for runs this configuration does not have are NaN; drop them so the
    # median is taken over real scores only.
    scores = accuracy[key].dropna().sort_values()
    figname = '_'.join(str(x) for x in key[1:]).replace(' ','')

    # Take the (fold, run) from the sorted scores, not from the unsorted table.
    median_fold, median_run = scores.index[len(scores)//2]
    points = latent_space[key][median_run]['mean'].values
    # Force a boolean mask so it can be used for indexing and negated with ~.
    misses = missed[key][median_run][median_fold].values.astype(bool)

    f, ax = plt.subplots(1,1, figsize=(10,10))
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    if dataset == 'oil flow':
        colors = plt.get_cmap('tab10')(range(len(np.unique(labels))))
        markers = ['o','X',',']
        for j, c in zip(np.unique(labels), colors):
            ax.scatter(points[labels==j, 0], points[labels==j, 1], color=c, label=j, marker=markers[j], s=200)
        # Ring the misclassified points once, after all classes are drawn.
        ax.scatter(points[misses, 0], points[misses, 1], facecolors='none', edgecolors='r', linewidths=2, s=450)
    elif dataset == 'USPS digits':
        # Draw the misses first so they take priority when overlaps are hidden,
        # then pass their boxes on so correctly classified digits avoid them.
        drawn_miss, miss_boxes = plot_image_scatter(ax, points[misses],digit_images[misses], color=(255,0,0))
        drawn_not, _ = plot_image_scatter(ax, points[~misses],digit_images[~misses], previously_drawn_boxes=miss_boxes)

    f.tight_layout()
    figs_folder = experiment_folder / 'figs' / key[0]
    figs_folder.mkdir(parents=True, exist_ok=True)
    plt.savefig(figs_folder/f'{figname}.pdf', bbox_inches='tight', transparent=True)
    plt.close(f)