In [None]:
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [None]:
import os
import pickle
from photocatalysis.learners_treesearch import ML_model, generate_ml_vectors

In [None]:
def transform_data(y_train, y_test):
    """
    Standardise the target values with a scaler fitted on the training set.

    The scaler is fitted on ``y_train`` only and then applied to ``y_test``.
    One-dimensional inputs are reshaped to a single column first, because
    ``StandardScaler`` only accepts 2-D arrays; inputs that are already 2-D
    are passed through unchanged.

    :param y_train: train labels, either 1-D (length N) or 2-D
    :param y_test: test labels, either 1-D (length M) or 2-D
    :return: y_train_scaled, y_test_scaled, y_scaler
    """
    # Promote flat label vectors to column vectors so the scaler accepts them.
    if np.ndim(y_train) == 1:
        y_train = np.reshape(y_train, (-1, 1))
    if np.ndim(y_test) == 1:
        y_test = np.reshape(y_test, (-1, 1))

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)
    # Reuse the training statistics for the test labels.
    y_test_scaled = y_scaler.transform(y_test)

    return y_train_scaled, y_test_scaled, y_scaler

In [None]:
class Tanimoto(gpflow.kernels.Kernel):
    """
    Tanimoto kernel with a single trainable variance parameter:
    k(x, y) = σ² * <x, y> / (||x||^2 + ||y||^2 - <x, y>)
    """

    def __init__(self):
        super().__init__()
        # The positive() transform keeps the kernel variance positive during optimisation
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Evaluate the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))
        :param X: N x D array
        :param X2: M x D array. When omitted, X is compared against itself (N x N result).
        :return: The kernel matrix of dimension N x M
        """
        other = X if X2 is None else X2

        # Pairwise inner products <x, y> between the rows of X and the rows of `other`
        cross = tf.tensordot(X, other, [[-1], [-1]])

        # Squared L2-norm of every row
        sq_norm_x = tf.reduce_sum(tf.square(X), axis=-1)
        sq_norm_other = tf.reduce_sum(tf.square(other), axis=-1)

        # ||x||^2 + ||y||^2 for every pair of rows, minus the inner product
        pairwise_norm_sum = broadcasting_elementwise(tf.add, sq_norm_x, sq_norm_other)
        denominator = pairwise_norm_sum - cross

        scaled_cross = self.variance * cross
        return scaled_cross / denominator

    def K_diag(self, X):
        """
        Diagonal of the kernel matrix of X with itself.
        :param X: N x D array
        :return: tensor shaped like X without its last axis, every entry equal to the variance
        """
        leading_shape = tf.shape(X)[:-1]
        variance_scalar = tf.squeeze(self.variance)
        return tf.fill(leading_shape, variance_scalar)

-----

In [None]:
## Load data and OPTIMIZED model parameters
scratch_dir = '/localdisk/bt308495/'
scratch_fname = 'scratch_distance_matrix_70_30'

# IP pickle: unpacks to the train frame, the test frame and a third item kept as `kip`
# (presumably the optimised IP model parameters -- confirm against the code that wrote it).
with open(
    os.path.join(scratch_dir, scratch_fname, 'ML_IP_70_30.pckl'),
    'rb',
) as f:
    df_train, df_test, kip = pickle.load(f)

# dGmax pickle: only the third item is kept (`krdg`); the first two are discarded.
with open(
    os.path.join(scratch_dir, scratch_fname, 'ML_dGmax_70_30.pckl'),
    'rb',
) as f:
    _, _, krdg = pickle.load(f)

# Single pickled object stored outside the scratch directory.
with open(
    '/home/btpq/bt308495/Thesis/frames/DF_COMPLETE_AFP.pckl',
    'rb',
) as f:
    df_adsorb_fingerprint = pickle.load(f)

In [None]:
## Gather training and test data
# NOTE(review): only the test frame is passed through generate_ml_vectors here;
# df_train is used as loaded and presumably already carries the
# morgan_fp_bitvect column -- confirm.
df_test = generate_ml_vectors(df_test)

# A column holding one fingerprint per row comes back from `.values` as a 1-D
# object array. Stack the rows into a dense 2-D float64 matrix so the kernel can
# reduce over the last axis and TensorFlow can convert it to a tensor.
# Assumes every entry is an array-like fingerprint of the same length -- confirm.
X_train = np.stack(df_train.morgan_fp_bitvect.tolist()).astype(np.float64)
y_train_ip = df_train.IP.values
y_train_rdg = df_train.dGmax.values

X_test = np.stack(df_test.morgan_fp_bitvect.tolist()).astype(np.float64)
y_test_ip = df_test.IP.values
y_test_rdg = df_test.dGmax.values

In [None]:
### Transform data
# Standardise each target separately; every call returns its own scaler,
# fitted on the corresponding training targets.
(
    y_train_ip_scaled,
    y_test_ip_scaled,
    y_ip_scaler,
) = transform_data(y_train_ip, y_test_ip)

(
    y_train_rdg_scaled,
    y_test_rdg_scaled,
    y_rdg_scaler,
) = transform_data(y_train_rdg, y_test_rdg)

In [None]:
### Instantiate Kernel and Model
K = Tanimoto()

# GP regression on the scaled dGmax training targets. The constant mean function
# is initialised to the mean of those scaled targets.
m = gpflow.models.GPR(
    data=(X_train, y_train_rdg_scaled),
    kernel=K,
    mean_function=Constant(np.mean(y_train_rdg_scaled)),
    noise_variance=1,
)