In [1]:
import torch
torch.set_grad_enabled(False)
import sys
import time
import random
import csv
import numpy as np
import threading


import scipy.io
from scipy.io import loadmat

import pickle

from tqdm import tqdm
from tqdm import notebook

import matplotlib
import matplotlib.pyplot as plt

import importlib
import utils

import copy
# Enable autoreload
%load_ext autoreload
%autoreload 2
importlib.reload(utils)

# Set seeds for reproducibility
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)  # if you are using multi-GPU.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark     = False

TORCH_DTYPE = torch.float64 #NB: Basically all of the matrices in Spatial_GP have 1.e-7 added to the diagonal, to be changed if we want to use float64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
torch.set_default_dtype(TORCH_DTYPE)
torch.set_default_device(device)
print(f'Device is: {device}')




Using device: cuda:0 (from utils.py)
Using device: cuda:0 (from utils.py)
Device is: cuda:0


### In this notebook we evaluate the performance on of the active loop compared to the randomly picked images one.
To compare two models we dont use the r2 score but their loglikeligood value. 
As a reference, we have a model trained on the "whole" dataset, of around 2200 images. 
The remaining 1000 are kept as a test set. 
This model is trained with ntilde=ntrain so we expect the maximum performance on it.


Our active model will instead start with 50=ntilde=ntrain and increment the number of images based on their utility until ntilde=300.
The model will then try to calculate the loglikelihood at each iteration ( number of inducing points ) of the same test set of 1000 images. 
To have a good estimation of its performance on the **fixed 1000 imgs test set** we should run 10 different iterations of the above 2 lines, **changing the starting 50 inducing points**



### Parameters of the training

In [2]:
rand_xtilde = True # If True, xtilde (inducing points) are chosen randomly, if False, xtilde is chosen from the first ntilde images

cellid       = 8         # Choose cell
ntrain_start = 50        # Number of first training data points

kernfun      = 'acosker' # Choose kernel function

nEstep       = 10         # Total number of E-steps iterations.
nFparamstep  = 10  
nMstep       = 0         # Total number of M-steps iterations. 
maxiter      = 20         # Iterations of the optimization algorithm comprising M and E steps

ntilde       = ntrain_start
n_test_lk    = 1000       # Number of test data points on which to compute log-likelihood, to compare 2 models without using the r2 score

n_IC_inducing_points = 30 # Number of initial sets of starting inducing points from which to start the optimization algorithm
n_added_imgs         = 200


### Import dataset and generate starting dataset

Create starting dataset on which to train with m step with ntilde = ntrain_start

In [3]:
# region ________ Load the data to fixed datased vectors that dont change ever during the loop, pick the cell__________
with open('/home/idv-eqs8-pza/IDV_code/Variational_GP/spatial_GP/Data/data2_41mixed_tr28.pkl', 'rb') as file:
    # Load the data from the file
    loaded_data = pickle.load(file)
    # loaded_data is a Dataset object from module Data with attributes "images_train, _val, _test" as well as responses

X_train = torch.tensor(loaded_data.images_train).to(device, dtype=TORCH_DTYPE) #shape (2910,108,108,1) where 108 is the number of pixels. 2910 is the amount of training points
X_val   = torch.tensor(loaded_data.images_val).to(device, dtype=TORCH_DTYPE)
X_test  = torch.tensor(loaded_data.images_test).to(device, dtype=TORCH_DTYPE) # shape (30,108,108,1) # nimages, npx, npx

R_train = torch.tensor(loaded_data.responses_train).to(device, dtype=TORCH_DTYPE) #shape (2910,41) 2910 is the amount of training data, 41 is the number of cells
R_val   = torch.tensor(loaded_data.responses_val).to(device, dtype=TORCH_DTYPE)
R_test  = torch.tensor(loaded_data.responses_test).to(device, dtype=TORCH_DTYPE) # shape (30,30,42) 30 repetitions, 30 images, 42 cells

# Create the complete dataset
X = torch.cat( (X_train, X_val), axis=0,) #shape (3160,108,108,1)
R = torch.cat( (R_train, R_val), axis=0,)

n_px_side = X.shape[1]  

# Reshape images to 1D vector and choose a cell
X = torch.reshape(X, ( X.shape[0], X.shape[1]*X.shape[2])) 
R = R[:,cellid] # shape (nt,) where nt is the number of trials

# endregion

# region ________ Set the FIXED test set indexes __________

# Choose a random subset of the data and save the idx
all_idx  = torch.arange(0, X.shape[0])                     # Indices of the whole dataset  
torch.manual_seed(0)                                       # This seed controls which test set 1000 is chosen
torch.cuda.manual_seed(0)
all_idx_perm  = torch.randperm(all_idx.shape[0])           # Random permutation of the indices
test_1000_idx = all_idx_perm[-n_test_lk:]                  # Take the last 1000 images out of the set
all_idx_perm  = all_idx_perm[~torch.isin( all_idx_perm, test_1000_idx )] # Remove the test set indices from the permutation

# endregion

# region ________ Loop to generate always different starting inducing points __________

# Variables common to all initial inducing points
loglk_test_1000 = torch.zeros((n_test_lk, n_added_imgs), dtype=torch.float16) # tensor of loglikelihoods test values for each initial inducing set, for each added image
threads = []                                                                  # list of threads to calculate the loglikelihoods in parallel
for s in range(n_IC_inducing_points):
    print(f" ====== Starting set of inducing points number {s} out of {n_IC_inducing_points} ======")
    
    # region ________ Set the starting training set indexes and extract the actual vectors__________
    # This line is different in this notebook compared to the simple active _training one. There, one permutation of the idx is sufficent, while here
    # for each permutation that generates the 1000 test set , we want 10 that generate the starting xtilde set.
    # all_idx_perm is therefore the randomly ordered index collection of the training set

    torch.manual_seed(0)                                       # This seed controls which initial inducing points are chosen
    torch.cuda.manual_seed(0)
    start_idx      = all_idx_perm[:ntrain_start]               # These will be the indices of the initial training. This way ensures not repetitions.

    in_use_idx    = start_idx
    xtilde_idx    = in_use_idx
    remaining_idx = all_idx_perm[~torch.isin( all_idx_perm, in_use_idx )]

    # Set the starting set
    xtilde_start  = X[xtilde_idx,:]                           # In the simplest case the starting points are all inducing points
    X_in_use      = X[in_use_idx,:]
    X_remaining   = X[remaining_idx,:]
    X_test_1000   = X[test_1000_idx,:]

    R_remaining   = R[remaining_idx]
    R_in_use      = R[in_use_idx]
    R_test_1000   = R[test_1000_idx]
    # endregion

    # region ________ Set the hyperparameters and fit the starting model __________
    # For details on the hyperparameters choice see one_cell_fit.ipynb
    logbetaexpr = utils.fromlogbetasam_to_logbetaexpr( logbetasam=torch.tensor(5.5) )# Logbetaexpr in this code is equal to logbeta in Samuele's code. Samuele's code set logbeta to 5.5
    logrhoexpr  = utils.fromlogrhosam_to_logrhoexpr( logrhosam=torch.tensor(5)) 
    # logbetaexpr = torch.tensor(4.65)
    # logrhoexpr = torch.tensor(4.3)
    sigma_0    = torch.tensor(1.0)
    Amp        = torch.tensor(1.0) 
    eps_0x     = torch.tensor(0.0001)
    eps_0y     = torch.tensor(0.0001)
    theta = {'sigma_0': sigma_0, 'Amp': Amp, 'eps_0x':eps_0x, 'eps_0y':eps_0y, '-2log2beta': logbetaexpr, '-log2rho2': logrhoexpr,  }

    # Set the gradient of the hyperparemters to be updatable 
    for key, value in theta.items():
        theta[key] = value.requires_grad_()

    hyperparams_tuple = utils.generate_theta( x=X_in_use, r=R_in_use, n_px_side=n_px_side, display=False, **theta)

    A        = torch.tensor(0.01)
    logA     = torch.log(A)
    lambda0  = torch.tensor(1.)
    f_params = {'logA': logA, 'lambda0':lambda0}
    f_params['logA'] = f_params['logA'].requires_grad_() # The optimal lambda0 is given with a fixed A.

    fit_parameters = {'ntilde':      ntrain_start,
                    'maxiter':     maxiter,
                    'nMstep':      nMstep,
                    'nEstep':      nEstep,
                    'nFparamstep': nFparamstep,
                    'kernfun':     kernfun,
                    'cellid':      cellid,
                    'n_px_side':   n_px_side,
                    'in_use_idx':  in_use_idx,     # Used idx for generating xtilde, referred to the whole X dataset
                    'xtilde_idx':  xtilde_idx,     # Used idx for generating the complete set, referred to the whole X dataset
                    'start_idx':   start_idx }     # Indexes used to generate the initial training set, same as the starting xtilde

    init_model = {
            'fit_parameters':    fit_parameters,
            'xtilde':            xtilde_start,
            'hyperparams_tuple': hyperparams_tuple,    
            'f_params':          f_params,
        }

    # Fit the starting model, with the given initial inducing points
    start_model, err_dict = utils.varGP(X_in_use, R_in_use, **init_model)

    if err_dict['is_error']:
        print('Error in the fit of the starting model')
        raise err_dict['error']

    # utils.save_model(start_model, f'models/starting_models_active_learning/cell:{cellid}_nstart:{ntrain_start}', additional_description='Starting model for active learning')

    print(f"====== Starting model for initial inducing set number: {s} ======")
    spk_count_test, spk_count_pred, r2, sigma_r2 = utils.test(X_test, R_test, X_train=X, at_iteration=None, **start_model )

    # endregion

    active_model = copy.deepcopy(start_model) 

    # Parameters for each active loop
    saved_time = [0]   # Time saved by threading the loglikelihood test calculation
    for i in range(n_added_imgs):

        # region __________ Retreive the values from the last model fit __________
        in_use_idx    = active_model['fit_parameters']['in_use_idx']
        xtilde_idx    = active_model['fit_parameters']['xtilde_idx']

        remaining_idx = all_idx_perm[~torch.isin( all_idx_perm, in_use_idx )]

        X_remaining = X[remaining_idx]
        R_remaining = R[remaining_idx]

        xtilde = X[xtilde_idx]  # This has to be the same as start_model['xtilde']

        xstar  = X_remaining

        # kernfun       = active_model['fit_parameters']['kernfun']
        # if kernfun == 'acosker': kernfun = utils.acosker
        n_px_side     = active_model['fit_parameters']['n_px_side']
        EIGVAL_TOL    = active_model['fit_parameters']['eigval_tol']

        final_kernel  = active_model['final_kernel']
        
        mask          = active_model['mask']
        C             = active_model['C']
        B             = active_model['B']
        K_tilde_b     = active_model['K_tilde_b']
        K_tilde_inv_b = active_model['K_tilde_inv_b']
        K_b           = active_model['K_b']
        Kvec          = active_model['Kvec']
        m_b           = active_model['m_b']
        V_b           = active_model['V_b']    
        f_params      = active_model['f_params']
        theta         = active_model['hyperparams_tuple'][0]
        A             = torch.exp(f_params['logA'])
        lambda0       = torch.exp(f_params['loglambda0']) if 'loglambda0' in f_params else f_params['lambda0']
        
        theta_lower_lims  = active_model['hyperparams_tuple'][1]
        theta_higher_lims = active_model['hyperparams_tuple'][2]

        # endregion

        # region __________ Calculate the loglikelihood on the 1000 Test set __________

        def calculate_loglikelihood_thread():
            start_time_thread = time.time()
            Kvec_test = utils.acosker(theta, X_test_1000[:,mask], x2=None, C=C, dC=None, diag=True)
            K_test    = utils.acosker(theta, X_test_1000[:,mask], x2=xtilde[:,mask], C=C, dC=None, diag=False)
            K_test_b  = K_test @ B 

            lambda_m_t, lambda_var_t = utils.lambda_moments(X_test_1000[:,mask], K_tilde_b, K_test_b@K_tilde_inv_b, Kvec_test, K_test_b, C, m_b, V_b, theta)

            f_mean = utils.mean_f_given_lambda_moments(f_params, lambda_m_t, lambda_var_t)

            loglk_test_1000[s,i] = utils.compute_loglikelihood(R_test_1000, f_mean, lambda_m_t, lambda_var_t, f_params)[0] 

            print(f" loglk_test_1000 for inducing set n: {s} and new image n: {i} is {loglk_test_1000[s,i]} inside threading") 
            saved_time[0] = time.time() - start_time_thread

        thread = threading.Thread(target=calculate_loglikelihood_thread, )
        threads.append(thread)
        thread.start()
        # endregion

        # region __________ Calculate the utility of each remaining image __________
        # Calculate the matrices to compute the lambda moments. They are referred to the unseen images xstar
        Kvec_star = utils.acosker(theta, xstar[:,mask], x2=None, C=C, dC=None, diag=True)
        K_star    = utils.acosker(theta, xstar[:,mask], x2=xtilde[:,mask], C=C, dC=None, diag=False)
        K_star_b  = K_star @ B 

        lambda_m_t, lambda_var_t = utils.lambda_moments( xstar[:,mask], K_tilde_b, K_star_b@K_tilde_inv_b, Kvec_star, K_star_b, C, m_b, V_b, theta)  

        logf_mean = A*lambda_m_t + lambda0
        logf_var  = A**2 * lambda_var_t

        # Estimate the utility and cap the maximum r ( used in a summation to infinity )
        r_masked = torch.arange(0, 100, dtype=TORCH_DTYPE)
        u2d      = utils.nd_utility(logf_var, logf_mean, r_masked )

        i_best       = u2d.argmax()                 # Index of the best image in the utility vector
        x_idx_best   = remaining_idx[i_best]    # Index of the best image in the dataset indices
        # print(f'Utility: {u2d[i_best].item():<8.6f} |  Best image ID: {i_best}  | Best image index: {x_idx_best}')

        if x_idx_best in in_use_idx :
            raise ValueError('The best image is already in use or in the test set')
        if x_idx_best in test_1000_idx:
            raise ValueError('The best image is in the test set')
        
        # endregion

        # region __________ Update indices  __________ 
        in_use_idx    = torch.cat( (in_use_idx, x_idx_best[None]))
        remaining_idx = all_idx_perm[~torch.isin( all_idx_perm, in_use_idx )]

        X_in_use    = X[in_use_idx]
        R_in_use    = R[in_use_idx] 
        X_remaining = X[remaining_idx]
        R_remaining = R[remaining_idx]

        xtilde_idx     = in_use_idx
        ntilde         = xtilde_idx.shape[0]
        nt             = X_in_use.shape[0]
        # We add it as the last row. This is not ensured if we use X[xtilde_idx] instead of xtilde
        xtilde_updated = X[xtilde_idx]
        # xtilde_updated = torch.cat((xtilde, X[x_idx_best][None,:]), axis=0) 

        active_model['xtilde']                       = xtilde_updated
        active_model['fit_parameters']['ntilde']     = ntilde
        active_model['fit_parameters']['in_use_idx'] = in_use_idx
        active_model['fit_parameters']['xtilde_idx'] = xtilde_idx

        #endregion

        # region __________ Update variational parameters - TODO: Projection missing? __________
        # To update the variational parameters to the new dimensionality we need to pass through the original space. 
        # V and m will be projected onto the right eigenspace in varGP using the last used B.
        V = B @ V_b @ B.T    # shape (ntilde-1, ntilde-1)
        V = 0.5*(V + V.T)    # Ensure symmetry
        m = B @ m_b          # shape (ntilde-1,)

        V_new = torch.eye(ntilde, dtype=V_b.dtype, device=V_b.device)#*lambda_var_t[i_best]
        V_new[:ntilde-1, :ntilde-1] = V       

        active_model['V'] = V_new 
        active_model['m'] = torch.cat( (m, m.mean()[None]))

        # endregion

        # region __________ Update the kernel matrices __________
        # Update kernel matrices by only computing their latest column
        init_kernel = {}
        C                     = final_kernel['C']
        mask                  = final_kernel['mask']
        K_tilde_reduced       = final_kernel['K_tilde']            # We call it reduced because its still the Ktilde used in the last iteration
        K_reduced             = final_kernel['K']           
        # P                     = final_kernel['eigvecs']            # Projection matrix to the complete eigenspace

        #100 computattions of K_tilde this way take ~0.03s.
        K_tilde_column  = utils.acosker(theta, xtilde_updated[:,mask], xtilde_updated[-1,mask][None], C=C, dC=None, diag=False) 
        K_tilde         = torch.cat((K_tilde_reduced, K_tilde_column[:-1]), axis=1)
        K_tilde         = torch.cat((K_tilde, K_tilde_column.T), axis=0)  

        if ntilde==nt: K = K_tilde
        else: raise NotImplementedError('Fast calculation of K not implemented for ntilde != ntrain')

        Kvec            = utils.acosker(theta, X_in_use[:,mask],x2=None, C=C, dC=None, diag=True)   
        # endregion

        # region __________ Eigenvalue decomposition and projection __________
        eigvals, eigvecs = torch.linalg.eigh(K_tilde, UPLO='L')                                # calculates the eigenvals for an assumed symmetric matrix, eigenvalues  are returned in ascending order. Uplo=L uses the lower triangular part of the matrix. Eigenvectors are columns
        ikeep            = eigvals > max(eigvals.max() * EIGVAL_TOL, EIGVAL_TOL)                          # Keep only the largest eigenvectors
        B                = eigvecs[:, ikeep]                                     
        # make K_tilde_b and K_b a projection of K_tilde and K into the eigenspace of the largest eigenvectors
        K_tilde_b        = torch.diag(eigvals[ikeep])                    
        K_b              = K @ B                                         
        
        K_tilde_inv_b = torch.diag_embed(1/eigvals[ikeep])        
        KKtilde_inv_b = B if ntilde==nt else K_b @ K_tilde_inv_b
        # endregion

        # region __________ Save the updated kernel to a dict to feed to the model __________
        init_kernel['C']                 = C
        init_kernel['mask']              = mask
        init_kernel['K_tilde']           = K_tilde
        init_kernel['K']                 = K
        init_kernel['Kvec']              = Kvec
        init_kernel['B']                 = B
        init_kernel['K_tilde_b']         = K_tilde_b
        init_kernel['K_b']               = K_b
        init_kernel['K_tilde_inv_b']     = K_tilde_inv_b
        init_kernel['KKtilde_inv_b']     = KKtilde_inv_b

        active_model['init_kernel']      = init_kernel        # We update the model after copy because deepcopy doesnt work if 'init_kernel' is a key
        # endregion
        
        # region __________ Fit new model __________

        active_model, err_dict = utils.varGP(X_in_use, R_in_use, **active_model)

        if err_dict['is_error']:
            print(f'Error in the fit of the active model starting with inducing condition {s} during new image {i}') 
            raise err_dict['error']

        # spk_count_test, spk_count_pred, r2, sigma_r2 = utils.test(X_test, R_test, at_iteration=None, **active_model )

        # utils.plot_loss_and_theta_notebook(active_model, figsize=(8,3), marker='.')#ylim_logmarg=(0, 5000))

        # endregion
    





updated sigma_0 to 1.0000
updated Amp to 1.0000
updated eps_0x to 0.0001
updated eps_0y to 0.0001
updated -2log2beta to 4.8069
updated -log2rho2 to 4.3069
Initial Loss: 112.0424
Loss iter 1: 66.9005
 No M-step
Loss iter 2: 66.8978
 No M-step
Loss iter 3: 66.8971
 No M-step
Loss iter 4: 66.8970
 No M-step
Loss iter 5: 66.8970
 No M-step
Loss iter 6: 66.8970
 No M-step
Loss iter 7: 66.8970
 No M-step
Loss iter 8: 66.8970
 No M-step
Loss iter 9: 66.8970
 No M-step
Loss iter 10: 66.8970
 No M-step
Loss iter 11: 66.8970
 No M-step
Loss iter 12: 66.8970
 No M-step
Loss iter 13: 66.8970
 No M-step
Loss iter 14: 66.8970
 No M-step
Loss iter 15: 66.8970
 No M-step
Loss iter 16: 66.8970
 No M-step
Loss iter 17: 66.8970
 No M-step
Loss iter 18: 66.8970
 No M-step
Loss iter 19: 66.8970

Time spent for E-steps:       0.971s,
Time spent for f params:      0.719s
Time spent for m update:      0.252s
Time spent for M-steps:       0.000s
Time spent for All-steps:     0.971s
Time spent computing Kernels

In [1]:
loglk_test_1000

NameError: name 'loglk_test_1000' is not defined

### Random training loop