In [5]:
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
# from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from src.contamination import Contamination
from src.latin_square import LatinSquare
from src.rna import RNA

from src.models.GPr import GPr

In [6]:
from ngboost import NGBRegressor
from ngboost.distns import Exponential, Normal
from ngboost.scores import LogScore, CRPScore

from ngboost.distns.normal import Normal
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV


In [30]:
class Tanimoto(gpflow.kernels.Kernel):
    """
    Tanimoto similarity kernel with a single trainable variance σ²:

        k(x, y) = σ² * <x, y> / (||x||² + ||y||² - <x, y>)

    The kernel has no lengthscale; the variance is its only parameter.
    """

    def __init__(self):
        super().__init__()
        # We constrain the value of the kernel variance to be positive when it's being optimised
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))

        When both x and y are all-zero the ratio is 0/0; that entry is defined as σ²
        so that it agrees with K_diag instead of producing NaN.

        :param X: N x D array
        :param X2: M x D array. If None, compute the N x N kernel matrix for X.
        :return: The kernel matrix of dimension N x M
        """
        if X2 is None:
            X2 = X

        Xs = tf.reduce_sum(tf.square(X), axis=-1)  # Squared L2-norm of each row of X
        X2s = tf.reduce_sum(tf.square(X2), axis=-1)  # Squared L2-norm of each row of X2
        # Pairwise inner products <x_i, y_j> (contraction over the feature axis)
        inner_product = tf.tensordot(X, X2, [[-1], [-1]])

        # Denominator of the Tanimoto formula: ||x||^2 + ||y||^2 - <x, y>
        denominator = -inner_product + broadcasting_elementwise(tf.add, Xs, X2s)

        # The denominator vanishes when both rows are all-zero. Divide by a patched
        # denominator first (so no NaN/Inf is ever formed, which also keeps gradients
        # finite) and then overwrite those entries with the variance.
        degenerate = tf.equal(denominator, tf.zeros_like(denominator))
        safe_denominator = tf.where(degenerate, tf.ones_like(denominator), denominator)
        scaled_similarity = self.variance * inner_product / safe_denominator

        return tf.where(degenerate, self.variance * tf.ones_like(denominator), scaled_similarity)

    def K_diag(self, X):
        """
        Compute the diagonal of the N x N kernel matrix of X

        k(x, x) = σ² for every row, so the diagonal is constant.

        :param X: N x D array
        :return: tensor of shape X.shape[:-1] (length N for an N x D input) filled with the variance
        """
        return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))
    
def transform_data(X_train, y_train, X_test, y_test):
    """
    Standardise inputs and targets with scalers fitted on the training split only.

    One StandardScaler is fitted on X_train and another on y_train; each fitted
    scaler is then applied to the matching test array. The input scaler is not
    returned, only the target scaler (e.g. for inverting predictions later).

    :param X_train: input train data
    :param y_train: train labels
    :param X_test: input test data
    :param y_test: test labels
    :return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
    """
    # Inputs: statistics come from the train split and are reused for the test split.
    input_scaler = StandardScaler()
    train_inputs = input_scaler.fit_transform(X_train)
    test_inputs = input_scaler.transform(X_test)

    # Targets: same procedure with an independent scaler.
    target_scaler = StandardScaler()
    train_targets = target_scaler.fit_transform(y_train)
    test_targets = target_scaler.transform(y_test)

    return train_inputs, train_targets, test_inputs, test_targets, target_scaler

In [31]:
# Choose the benchmark problem that supplies the dataset. The commented-out lines
# are alternative problems that can be swapped in.
# NOTE(review): `opt` is rebound to the GPflow optimiser in a later cell, so after
# that cell runs it no longer refers to this problem object.
# opt = LatinSquare(n=2000)
# opt = Contamination(n=300, lamda=0.0001)
opt = RNA(n=1000)
# Inputs and targets exposed by the problem object.
X = opt.X
y = opt.y

# Fraction of the data held out as the test set (passed to train_test_split).
test_set_size = 0.2

In [32]:
# Hold out `test_set_size` of the data; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=0)

# Reshape the targets into single-column 2-D arrays before they are handed to
# transform_data (which feeds them to StandardScaler).
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

#  We standardise the outputs but leave the inputs unchanged
#  (the scaled inputs returned by transform_data are discarded via `_`).

_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

# Cast the inputs to float64 — presumably to match the float64 model parameters; TODO confirm.
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

k = Tanimoto()
# k = gpflow.kernels.Matern32()
# GP regression on the standardised targets with a constant mean initialised at the
# training-target mean; noise_variance sets the initial likelihood variance.
m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=0.00001)


In [33]:
# Fit the trainable parameters by minimising the model's training loss.
# NOTE(review): this rebinds `opt`, which previously held the RNA problem object;
# a distinct name (e.g. `optimizer`) would avoid the clash — check other cells first.
opt = gpflow.optimizers.Scipy()
opt.minimize(m.training_loss, m.trainable_variables)
# Print the fitted parameter table.
print_summary(m)

╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤══════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │    value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪══════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │ -0.99722 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │  2.39408 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  1e-05   │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧══════

In [34]:
# Predictive mean and variance for the test inputs (model was fitted on standardised targets).
y_pred, y_var = m.predict_f(X_test)
# Map predictions and test targets back to the original target scale.
# y_var is NOT rescaled here, so it stays in standardised units.
y_pred = y_scaler.inverse_transform(y_pred)
# NOTE(review): this overwrites y_test in place, so re-running this cell without
# re-running the split cell applies the inverse transform a second time.
y_test = y_scaler.inverse_transform(y_test)

In [35]:
# Show the test-point predictive variances as a NumPy array with singleton
# dimensions removed (values are in standardised target units).
y_var.numpy().squeeze()

array([0.78266168, 0.80261353, 0.8811589 , 0.80032341, 0.87164075,
       0.86945012, 0.88038955, 0.87961391, 0.95517094, 0.82554276,
       0.85102612, 0.88036474, 0.85239289, 0.85386551, 0.87525606,
       0.85984803, 0.88130446, 0.86683995, 0.91083771, 0.8181584 ,
       0.80240671, 0.8208098 , 0.8546686 , 0.83895093, 0.82382609,
       0.83321954, 0.9537032 , 0.88989384, 0.85284603, 0.84747001,
       0.98942819, 0.78036291, 0.85539546, 0.94476962, 0.88246879,
       0.9882361 , 0.89834956, 0.84047742, 0.90968119, 0.9432345 ,
       0.96294753, 0.78024686, 0.88123101, 0.83376141, 0.86748997,
       0.77729689, 0.81022412, 0.82154936, 1.00239886, 0.9725462 ,
       0.95437696, 0.93585714, 0.81087595, 0.86235544, 0.96296771,
       0.86214374, 0.80105066, 0.85521435, 0.88712169, 0.86021316,
       0.8442158 , 0.92316797, 0.87991359, 0.86151994, 0.76644466,
       0.9142252 , 0.82623313, 0.82748306, 0.87261106, 0.84458986,
       0.8602869 , 0.94794593, 0.8109448 , 0.86647983, 0.91758

In [36]:
# Show the test predictions (already inverse-transformed to the original target scale).
y_pred.squeeze()

array([13.04892292, 10.78615243, 10.72444565, 10.12021463, 12.3453905 ,
       11.18856491, 10.3033119 ,  9.98352129, 10.4683996 , 10.04157029,
       10.85947566, 11.02867724, 11.32343837,  9.77942099, 10.76820208,
       10.11062746, 10.26221325, 10.69199944, 10.7051988 , 11.37148986,
        9.67060808, 12.27346714, 10.30369801, 11.2751096 , 11.02930202,
       10.68601497, 10.48644965,  9.83057745, 12.43992551, 12.10621746,
        9.82731125, 10.71556187, 12.64530996, 11.03653349,  9.63327481,
       10.93671539, 12.16348385, 10.82148095, 10.80780047, 10.27564164,
       12.6161012 , 12.32822961, 10.36934088, 10.61866317, 11.55418495,
       12.65908975, 11.15397315,  9.89441975, 11.74789265, 11.60141024,
       10.72892818,  9.9033533 , 10.86715061, 11.25879706, 10.4248625 ,
        9.57520916, 11.44463111, 10.88973987, 10.58375976, 10.64099138,
       11.30239206, 10.54819336, 10.6269026 , 11.19185527,  9.47700827,
       10.31953912, 11.89532539, 10.84459111, 10.68997304, 11.58

In [37]:
# Training-set fit: RMSE in standardised units and in the original target units.
# NOTE(review): the "nm" unit in the messages below may be a leftover from a
# different dataset — confirm it applies to the RNA target.
y_pred_train, _ = m.predict_f(X_train)
train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
print("\nTrain RMSE (Standardised): {:.3f} nm".format(train_rmse_stan))
print("Train RMSE: {:.3f} nm".format(train_rmse))


# Output R^2, RMSE and MAE on the test set
# (y_test and y_pred were inverse-transformed in the prediction cell, so these are in original units).
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("\nTest R^2: {:.3f}".format(score))
print("Test RMSE: {:.3f} nm".format(rmse))
print("Test MAE: {:.3f} nm".format(mae))


Train RMSE (Standardised): 0.000 nm
Train RMSE: 0.000 nm

Test R^2: 0.009
Test RMSE: 1.792 nm
Test MAE: 1.336 nm


In [24]:
# Rank the test points by ascending predictive variance, i.e. most confident first.
ranked_confidence_list = np.argsort(y_var, axis=0).flatten()
rmse_confidence_list = np.zeros((len(y_test) ))
mae_confidence_list = np.zeros((len(y_test) ))

# Entry i of each list is the error over the (i + 1) most confident test points.
# The loop index and results use their own names so that the kernel object `k` and
# the overall test metrics `rmse` / `mae` from earlier cells are not overwritten.
for n_retained in range(1, len(y_test) + 1):

    # Indices of the n_retained most confident predictions
    conf = ranked_confidence_list[:n_retained]

    # RMSE at this level of confidence
    rmse_confidence_list[n_retained - 1] = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))

    # MAE at this level of confidence
    mae_confidence_list[n_retained - 1] = mean_absolute_error(y_test[conf], y_pred[conf])

In [25]:
# Show the cumulative RMSE as progressively less confident test points are included.
rmse_confidence_list

array([0.32720557, 0.23349615, 0.28425053, 0.46789571, 0.45803177,
       0.41953734, 0.40448906, 0.39195142, 0.38403203, 0.41864875,
       0.40127959, 0.38476374, 0.3985938 , 0.38416665, 0.37140984,
       0.36127439, 0.3505675 , 0.34473185, 0.34376501, 0.33892446,
       0.3311232 , 0.33137435, 0.3488987 , 0.34190943, 0.33554708,
       0.32935321, 0.32620114, 0.32163998, 0.31622359, 0.3152653 ,
       0.31846924, 0.32091539, 0.32398087, 0.31922162, 0.31650347,
       0.31374797, 0.3095397 , 0.31094055, 0.31012004, 0.31179078,
       0.3094629 , 0.30595304, 0.30424809, 0.30186017, 0.29939892,
       0.29659863, 0.29950446, 0.3074105 , 0.30429667, 0.30123867,
       0.29915559, 0.3034244 , 0.30087118, 0.29886285, 0.29682502,
       0.29965957, 0.2972244 , 0.29548255, 0.29298284, 0.29054057])