In [1]:
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
# from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from src.contamination import Contamination
from src.latin_square import LatinSquare

from src.models.GPr import GPr

  warn(
  warn(


In [26]:
from ngboost import NGBRegressor
from ngboost.distns import Exponential, Normal
from ngboost.scores import LogScore, CRPScore

from ngboost.distns.normal import Normal
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV


In [11]:
class Tanimoto(gpflow.kernels.Kernel):
    """
    Tanimoto (Jaccard-style) kernel with a single trainable variance:

        k(x, y) = σ² * <x, y> / (||x||² + ||y||² - <x, y>)

    For a pair of all-zero rows the ratio is 0/0; that pair is assigned
    similarity 1 so that K agrees with K_diag (which reports σ² for every row).
    """

    def __init__(self):
        super().__init__()
        # We constrain the value of the kernel variance to be positive when it's being optimised
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))
        :param X: N x D array
        :param X2: M x D array. If None, compute the N x N kernel matrix for X.
        :return: The kernel matrix of dimension N x M
        """
        if X2 is None:
            X2 = X

        x_sq_norms = tf.reduce_sum(tf.square(X), axis=-1)  # Squared L2-norm of each row of X
        x2_sq_norms = tf.reduce_sum(tf.square(X2), axis=-1)  # Squared L2-norm of each row of X2
        # Pairwise inner products <x_i, y_j>: contracts the last axis of X with the last axis of X2
        cross_product = tf.tensordot(X, X2, [[-1], [-1]])

        # Denominator of the Tanimoto formula: ||x||^2 + ||y||^2 - <x, y>
        denominator = -cross_product + broadcasting_elementwise(tf.add, x_sq_norms, x2_sq_norms)

        # For real-valued rows the denominator is >= (||x||^2 + ||y||^2) / 2, so it is
        # exactly zero only when both rows are all-zero. Dividing there would give NaN
        # (0/0), so the division is done on a patched denominator and the affected
        # entries are set to the variance, i.e. similarity 1.
        both_zero = tf.equal(denominator, 0.0)
        safe_denominator = tf.where(both_zero, tf.ones_like(denominator), denominator)

        return tf.where(
            both_zero,
            self.variance * tf.ones_like(denominator),
            self.variance * cross_product / safe_denominator,
        )

    def K_diag(self, X):
        """
        Compute the diagonal of the N x N kernel matrix of X
        :param X: N x D array
        :return: tensor with the shape of X minus its last axis (length N for an
                 N x D input), every entry equal to the kernel variance
        """
        return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))
    
def transform_data(X_train, y_train, X_test, y_test):
    """
    Standardise inputs and targets with scalers fitted on the training split only.

    Two independent StandardScaler objects are used: one fitted on X_train and
    one fitted on y_train. Each is then applied to the matching train and test
    arrays, so the test split never influences the scaling statistics.
    :param X_train: input train data
    :param y_train: train labels
    :param X_test: input test data
    :param y_test: test labels
    :return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
             (y_scaler is the scaler fitted on y_train, returned so callers can
             undo the target scaling)
    """

    # Fit on the training arrays only, then transform both splits.
    input_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)

    scaled_inputs = (input_scaler.transform(X_train), input_scaler.transform(X_test))
    scaled_targets = (y_scaler.transform(y_train), y_scaler.transform(y_test))

    return (
        scaled_inputs[0],
        scaled_targets[0],
        scaled_inputs[1],
        scaled_targets[1],
        y_scaler,
    )

In [12]:
# Benchmark problem that supplies the design matrix and targets.
# Alternative problem kept for reference:
#opt = LatinSquare(n=2000)
opt = Contamination(n=300, lamda=1e-4)
X, y = opt.X, opt.y

# Fraction of the samples held out for testing (passed to train_test_split as test_size).
test_set_size = 0.2

In [13]:
# Reproducible train/test split (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_set_size, random_state=0
)

# Targets are turned into column vectors and standardised; the scaled inputs
# returned by transform_data are discarded, so the inputs stay unchanged.
_, y_train, _, y_test, y_scaler = transform_data(
    X_train, y_train.reshape(-1, 1), X_test, y_test.reshape(-1, 1)
)

# Cast the inputs to float64 before handing them to the GP model.
X_train, X_test = X_train.astype(np.float64), X_test.astype(np.float64)

# Kernel choice; alternative kept for reference: gpflow.kernels.Matern32()
k = Tanimoto()

# GP regression model with a constant mean initialised at the mean of the
# (standardised) training targets and a small initial noise variance.
m = gpflow.models.GPR(
    data=(X_train, y_train),
    kernel=k,
    mean_function=Constant(np.mean(y_train)),
    noise_variance=1e-5,
)


In [160]:
# Fit the model hyperparameters by minimising the model's training loss.
# NOTE(review): `opt` was bound to the benchmark problem object in an earlier
# cell; from here on it refers to the optimiser instead.
opt = gpflow.optimizers.Scipy()
opt.minimize(closure=m.training_loss, variables=m.trainable_variables)

# Show the fitted parameter values.
print_summary(m)

╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤══════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │    value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪══════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │ -2.14777 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │  1.75273 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  1e-05   │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧══════

In [161]:
# Predictive mean and variance returned by the model for the test inputs.
y_pred, y_var = m.predict_f(X_test)
# Map the predicted means back to the original target scale. y_var is NOT
# transformed here, so it stays in the standardised units the model was trained in.
y_pred = y_scaler.inverse_transform(y_pred)
# NOTE(review): y_test is overwritten with its un-scaled version, so re-running
# this cell without first re-running the split cell applies inverse_transform twice.
y_test = y_scaler.inverse_transform(y_test)

In [165]:
# Predictive variances for the test points as a flat NumPy array
# (still in standardised target units; only y_pred was inverse-transformed).
np.squeeze(y_var.numpy())

array([0.33090509, 0.29323645, 0.27569958, 0.38293804, 0.55186894,
       0.27450514, 0.2212626 , 0.44540456, 0.21540746, 0.19931773,
       0.79404407, 0.30834906, 0.55250438, 0.26161385, 0.27008172,
       0.35400501, 0.23665507, 0.39911424, 0.24153515, 0.33799247,
       0.20009657, 0.21065968, 0.35196697, 0.35798925, 0.29759923,
       0.28474165, 0.45911288, 0.25661039, 0.54810662, 0.34935706,
       0.40646474, 0.16215061, 0.45060432, 0.2244177 , 0.29138425,
       0.37851139, 0.23634718, 0.3916373 , 0.23237996, 0.213299  ,
       0.25476047, 0.4281434 , 0.54622334, 0.35817801, 0.33660736,
       0.28664215, 0.28735927, 0.30046277, 0.31844842, 0.30786249,
       0.3742284 , 0.2792149 , 0.20159599, 0.32068754, 0.32576645,
       0.26963602, 0.27178021, 0.32344859, 0.22670815, 0.34615359])

In [167]:
# Predicted test targets (after inverse_transform) as a flat array.
np.squeeze(y_pred)

array([2.09073869, 1.93821278, 0.91175986, 1.505309  , 0.88965641,
       1.78275912, 1.79515858, 1.70707869, 2.17148008, 1.991912  ,
       0.89773583, 1.8760215 , 1.11139583, 1.31169085, 1.5700163 ,
       2.08098649, 1.85624904, 1.61443037, 2.36991535, 1.18689895,
       1.97695272, 1.69813417, 1.78279465, 1.40736081, 1.24530878,
       1.81928552, 0.92609013, 1.37285989, 1.09087017, 1.73976604,
       1.9695438 , 2.45363282, 1.27723201, 1.72117032, 2.03265628,
       1.44424161, 1.94960097, 1.12968579, 1.58115391, 1.76411155,
       1.34638383, 1.34069672, 1.13082344, 1.21581243, 1.51069622,
       2.13063891, 1.24830599, 1.85815959, 1.60402279, 1.66929078,
       1.47929668, 2.21103365, 2.19302947, 2.08984454, 1.21441771,
       1.90840089, 0.96639795, 1.75756909, 2.03785557, 1.46645775])

In [23]:
# --- Training-set fit -------------------------------------------------------
y_pred_train, _ = m.predict_f(X_train)

# RMSE in the standardised target units the model was trained in ...
train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
# ... and after mapping both targets and predictions back through y_scaler.
train_rmse = np.sqrt(
    mean_squared_error(
        y_scaler.inverse_transform(y_train),
        y_scaler.inverse_transform(y_pred_train),
    )
)

# NOTE(review): the "nm" unit in these messages may be inherited from a
# different dataset — confirm it applies to this problem's targets.
print("\nTrain RMSE (Standardised): {:.3f} nm".format(train_rmse_stan))
print("Train RMSE: {:.3f} nm".format(train_rmse))

# --- Test-set metrics: R^2, RMSE and MAE ------------------------------------
# y_test and y_pred are the values produced by the prediction cell.
score = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print("\nTest R^2: {:.3f}".format(score))
print("Test RMSE: {:.3f} nm".format(rmse))
print("Test MAE: {:.3f} nm".format(mae))


Train RMSE (Standardised): 0.000 nm
Train RMSE: 0.000 nm

Test R^2: 0.615
Test RMSE: 0.291 nm
Test MAE: 0.229 nm


In [24]:
# Confidence-error curves: rank the test points by predictive variance
# (ascending, i.e. most confident first) and record the RMSE / MAE over the
# n most confident points for n = 1 .. number of test points.
ranked_confidence_list = np.argsort(y_var, axis=0).flatten()
n_test = len(y_test)
rmse_confidence_list = np.zeros(n_test)
mae_confidence_list = np.zeros(n_test)

# The loop uses its own counter and writes straight into the result arrays so
# that it does not rebind `k` (the kernel instance created in the
# model-construction cell) or `rmse` / `mae` (the test metrics computed in the
# metrics cell).
for n_kept in range(1, n_test + 1):

    # Indices of the n_kept lowest-variance test points
    conf = ranked_confidence_list[:n_kept]

    # RMSE at this confidence level
    rmse_confidence_list[n_kept - 1] = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))

    # MAE at this confidence level
    mae_confidence_list[n_kept - 1] = mean_absolute_error(y_test[conf], y_pred[conf])

In [25]:
# Entry i is the RMSE over the i+1 test points with the lowest predictive
# variance; the last entry therefore covers the whole test set.
rmse_confidence_list

array([0.32720557, 0.23349615, 0.28425053, 0.46789571, 0.45803177,
       0.41953734, 0.40448906, 0.39195142, 0.38403203, 0.41864875,
       0.40127959, 0.38476374, 0.3985938 , 0.38416665, 0.37140984,
       0.36127439, 0.3505675 , 0.34473185, 0.34376501, 0.33892446,
       0.3311232 , 0.33137435, 0.3488987 , 0.34190943, 0.33554708,
       0.32935321, 0.32620114, 0.32163998, 0.31622359, 0.3152653 ,
       0.31846924, 0.32091539, 0.32398087, 0.31922162, 0.31650347,
       0.31374797, 0.3095397 , 0.31094055, 0.31012004, 0.31179078,
       0.3094629 , 0.30595304, 0.30424809, 0.30186017, 0.29939892,
       0.29659863, 0.29950446, 0.3074105 , 0.30429667, 0.30123867,
       0.29915559, 0.3034244 , 0.30087118, 0.29886285, 0.29682502,
       0.29965957, 0.2972244 , 0.29548255, 0.29298284, 0.29054057])