In [46]:
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
# from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from src.contamination import Contamination
from src.latin_square import LatinSquare

In [123]:
# Settings shared by every generated job script.
problem, learner, acqfun, search = 'CON', 'NGBlin', 'EI', 'MH'
epsilon = 0.0
n_exp = np.arange(1, 9)  # experiment indices 1..8

# Text placed before the run command in every script.
header = '''#!/bin/bash
#$ -q teano
#$ -pe smp 10
#$ -j yes
#$ -cwd

# Load anaconda malware environment
conda activate base

# Run the executable
'''

# Text placed after the run command in every script.
tail = '''

# Deactivate anaconda environment
conda deactivate
'''

text = []
for i in n_exp:
    # One run.py invocation per experiment index, with stdout redirected to a
    # log file named after the configuration.
    command = (f"python -u run.py --problem {problem} "
               f"--learner {learner} --acqfun {acqfun} "
               f"--niters 500 --search {search} "
               f"--epsilon {epsilon} --nexp {i} "
               f"> exp_{i}_{problem}_{learner}_o{search}_af{acqfun}.out")

    # Full script = header + command + tail.
    current = ''.join((header, command, tail))

    with open(f'experiment{i}.sub', 'w') as f:
        f.write(current)

In [119]:
# Job-script template pieces: `header` holds the shebang, the `#$` scheduler
# directives and the conda activation; `tail` holds the conda deactivation.
# NOTE(review): `header` and `tail` are also assigned in another cell of this
# notebook, where `tail` begins with one additional blank line. Whichever cell
# ran last determines the value seen by later cells -- consider keeping a
# single definition.
header = '''#!/bin/bash
#$ -q teano
#$ -pe smp 10
#$ -j yes
#$ -cwd

# Load anaconda malware environment
conda activate base

# Run the executable
'''

tail = '''
# Deactivate anaconda environment
conda deactivate
'''

#!/bin/bash
#$ -q teano
#$ -pe smp 10
#$ -j yes
#$ -cwd

# Load anaconda malware environment
conda activate base

# Run the executable

# Deactivate anaconda environment
conda deactivate



In [47]:
class Tanimoto(gpflow.kernels.Kernel):
    """
    Tanimoto similarity kernel scaled by a single trainable variance:
    k(x, y) = σ² * <x, y> / (||x||^2 + ||y||^2 - <x, y>)
    """

    def __init__(self):
        super().__init__()
        # Signal variance σ²; the positive() transform keeps it > 0 while it is optimised
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Evaluate the kernel between every row of X and every row of X2.
        :param X: N x D array
        :param X2: M x D array. If None, X is used for both arguments (N x N result).
        :return: The kernel matrix of dimension N x M
        """
        other = X if X2 is None else X2

        # Pairwise inner products <x_i, y_j>, contracting the last axis of both inputs
        cross = tf.tensordot(X, other, [[-1], [-1]])

        # Squared L2-norms of the rows of each input
        sq_norm_x = tf.reduce_sum(tf.square(X), axis=-1)
        sq_norm_other = tf.reduce_sum(tf.square(other), axis=-1)

        # Tanimoto denominator ||x_i||^2 + ||y_j||^2 - <x_i, y_j>.
        # It is 0 (giving 0/0) when both rows are all-zero.
        norm_sum = broadcasting_elementwise(tf.add, sq_norm_x, sq_norm_other)
        denominator = norm_sum - cross

        return self.variance * cross / denominator

    def K_diag(self, X):
        """
        Diagonal of the kernel matrix of X with itself: every entry is the variance.
        :param X: N x D array
        :return: tensor with the shape of X minus its last axis (length N for N x D input)
        """
        diag_shape = tf.shape(X)[:-1]
        sigma_sq = tf.squeeze(self.variance)
        return tf.fill(diag_shape, sigma_sq)
    
def transform_data(X_train, y_train, X_test, y_test):
    """
    Standardise inputs and targets with scalers fitted on the training split only.
    The fitted target scaler is returned so predictions can be mapped back to
    the original scale with ``inverse_transform``.
    :param X_train: input train data
    :param y_train: train labels
    :param X_test: input test data
    :param y_test: test labels
    :return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
    """

    # Targets: statistics come from the training labels, then are applied to both splits
    y_scaler = StandardScaler().fit(y_train)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    # Inputs: same scheme, with a separate scaler that is not returned
    x_scaler = StandardScaler().fit(X_train)
    X_train_scaled = x_scaler.transform(X_train)
    X_test_scaled = x_scaler.transform(X_test)

    return X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler

In [94]:
# Problem instance providing the data set; the commented line swaps in LatinSquare instead.
#opt = LatinSquare(n=2000)
opt = Contamination(n=2000, lamda=0.0001)
X, y = opt.X, opt.y

# Fraction of the samples held out for testing
test_set_size = 0.2

In [95]:
# Hold-out split; the fixed random_state keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_set_size, random_state=0
)

# Targets become single-column (N x 1) arrays before scaling.
y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)

# Only the standardised targets and their scaler are kept; the scaled inputs
# returned by transform_data are discarded, so the model sees the raw inputs.
_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

# Inputs are cast to float64 before being handed to the model.
X_train, X_test = X_train.astype(np.float64), X_test.astype(np.float64)

# GP regression with the Tanimoto kernel and a constant mean initialised at
# the mean of the standardised training targets.
k = Tanimoto()
m = gpflow.models.GPR(
    data=(X_train, y_train),
    kernel=k,
    mean_function=Constant(np.mean(y_train)),
    noise_variance=1,
)


In [96]:
# Fit the model: minimise m.training_loss over m.trainable_variables with
# GPflow's Scipy optimiser, then print a summary table of the model parameters.
# NOTE(review): this rebinds `opt`, which an earlier cell uses for the
# Contamination problem instance. After this cell `opt` no longer refers to
# the problem object -- consider a distinct name for the optimiser.
opt = gpflow.optimizers.Scipy()
opt.minimize(m.training_loss, m.trainable_variables)
print_summary(m)

╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤══════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │    value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪══════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │ -2.73748 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │  1.35498 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  0.02286 │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧══════

In [97]:
# Model predictions (mean and variance) at the held-out inputs.
y_pred, y_var = m.predict_f(X_test)
# Map the predicted means back through y_scaler; y_var is left as returned by the model.
y_pred = y_scaler.inverse_transform(y_pred)
# NOTE(review): y_test is overwritten in place, so running this cell a second
# time applies inverse_transform to it twice.
y_test = y_scaler.inverse_transform(y_test)

In [98]:
# Training-set fit, reported on the standardised scale and after mapping both
# targets and predictions back through y_scaler.
y_pred_train, _ = m.predict_f(X_train)
train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
# No unit suffix is printed: the standardised RMSE is dimensionless, and the
# targets come from the benchmark problem object rather than a measured
# wavelength. NOTE(review): add the correct unit here if the target has one.
print("\nTrain RMSE (Standardised): {:.3f}".format(train_rmse_stan))
print("Train RMSE: {:.3f}".format(train_rmse))


# Output R^2, RMSE and MAE on the test set
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("\nTest R^2: {:.3f}".format(score))
print("Test RMSE: {:.3f}".format(rmse))
print("Test MAE: {:.3f}".format(mae))


Train RMSE (Standardised): 0.068 nm
Train RMSE: 0.033 nm

Test R^2: 0.893
Test RMSE: 0.161 nm
Test MAE: 0.126 nm


In [99]:
# Confidence-error curves: entry j holds the RMSE / MAE over the j+1 test
# points with the lowest predictive variance.
#
# Computed with cumulative sums instead of a Python loop. Besides avoiding the
# repeated prefix recomputation, this no longer rebinds `k` (the kernel object
# defined in an earlier cell) or the test-set `rmse` / `mae` values.

# Test indices ordered from smallest to largest predictive variance
ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

# Signed errors in that order; y_test and y_pred are single-column arrays
ranked_errors = (y_test[ranked_confidence_list] - y_pred[ranked_confidence_list]).ravel()

# Number of points included at each position: 1, 2, ..., len(y_test)
n_included = np.arange(1, len(ranked_errors) + 1)

rmse_confidence_list = np.sqrt(np.cumsum(np.square(ranked_errors)) / n_included)
mae_confidence_list = np.cumsum(np.abs(ranked_errors)) / n_included

In [101]:
# Display the RMSE curve: entry j is the RMSE over the j+1 lowest-variance test points.
rmse_confidence_list

array([0.00819251, 0.02899606, 0.02497636, 0.16000292, 0.14567414,
       0.13298667, 0.14030063, 0.13340461, 0.1348033 , 0.1333208 ,
       0.13058016, 0.12897997, 0.14928521, 0.14397815, 0.14420755,
       0.14378376, 0.13984755, 0.14226892, 0.14685822, 0.14676362,
       0.14980991, 0.14678895, 0.14359473, 0.14081814, 0.1535156 ,
       0.15187388, 0.1498225 , 0.15518232, 0.15248332, 0.1531791 ,
       0.15159734, 0.153925  , 0.15161663, 0.15059126, 0.15404465,
       0.15215139, 0.15731668, 0.15744057, 0.15541381, 0.15364253,
       0.15458796, 0.15381623, 0.15581077, 0.15570536, 0.15485688,
       0.15358384, 0.1583823 , 0.16232103, 0.16804936, 0.16657359,
       0.16510623, 0.16418924, 0.16743004, 0.16629797, 0.1657063 ,
       0.16422226, 0.16445646, 0.16334954, 0.16451783, 0.16375416,
       0.16245935, 0.16214178, 0.16115942, 0.16085964, 0.15994537,
       0.16618483, 0.16510811, 0.163936  , 0.16577508, 0.16561974,
       0.16460005, 0.1658246 , 0.16569187, 0.16460938, 0.16351