In [32]:
import numpy as np
import pandas as pd
import GPy
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.utils import shuffle
import math
from sklearn.metrics import r2_score

In [4]:
# Loading in data and subsetting columns
data = pd.read_csv("TYK2_final.csv")
data = data.drop(['target', 'top_2p', 'top_5p'], axis=1)
column_names = ['smiles', 'target']
data.columns = column_names
data

Unnamed: 0,smiles,target
0,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)N)Cl,5.608397
1,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...,7.972925
2,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...,6.731267
3,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...,7.653882
4,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...,6.562022
...,...,...
9992,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)CF)Cl,7.232871
9993,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)CO)Cl,7.230769
9994,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)Cl)Cl,7.756025
9995,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)N)Cl,9.215634


In [6]:
#Converting to fingerprints
def smiles_to_fingerprint(smiles, nBits=4096):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=4, nBits=nBits, useChirality=True)
    return list(fp)

data['fingerprint'] = data['smiles'].apply(smiles_to_fingerprint)

print(data.head())

                                              smiles    target  \
0            C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)N)Cl  5.608397   
1  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...  7.972925   
2  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...  6.731267   
3  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...  7.653882   
4  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...  6.562022   

                                         fingerprint  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [18]:
#Getting X and y values to start with
X = np.array([val for val in data['fingerprint'].values])
y = data['target'].values.reshape(-1,1)
X, y = shuffle(X,y, random_state=42)

In [43]:
#Seeing what the initial accuracy would be if we just randomly sampled 1% of the data to train on

#Starting data with 10%
size = int(len(X) * 0.1)
start_X = X[:size]
start_y = y[:size]

#Testing on 90%
remaining_X = X[size:]
remaining_y = y[size:]

#creating kernel
k = GPy.kern.RBF(start_X.shape[1])

#training and optimizing GP regression model
m = GPy.models.GPRegression(start_X, start_y, k)
print("model trained")
m.optimize('bfgs', max_iters=10)
print("model optimized")

#Predicting on 90%
pred_means, pred_vars = m.predict(remaining_X)

#Getting r^2 score
r2_score(remaining_y, pred_means)

model trained
model optimized
[[6.91754886]
 [7.15006211]
 [6.18780918]
 ...
 [6.21177475]
 [6.87642268]
 [6.89914041]]


In [None]:
#This function runs sequential model-based optimization. This function works by training a sparse GP model on the start data, 
#and using a selection/activation function that performs UCB to select the data point to query.

#Input: starting data and unlabeled remaining data
#Output: Instance with the maximum GP mean upon prediction, as well as the actual value (if it is 9.0)

def smbo(start_X, start_y, remaining_X, remaining_y):
    
    k = GPy.kern.RBF(start_X.shape[1])

    m = GPy.models.SparseGPRegression(start_X, start_y, k)

    m.optimize('bfgs', max_iters=10)

    mean, var = m.predict(remaining_X, full_cov=False)

    return mean, var, m

In [45]:
#This block runs SMBO to select data points iteratively until we have 10% of the data to train a GP model on

#Randomly select 5% of the data to start training on
size = int(len(X) * 0.05)
start_X = X[:size]
start_y = y[:size]

#get the current unlabeled points
remaining_X = X[size:]
remaining_y = y[size:]

#set initial variables for calculating UCB
Dsize = len(X)
bo_lambda = 0.1
bo_iters = 1

#calculate beta constant from 
beta = 2 * math.log(Dsize * math.pow(bo_iters,2) * math.pow(np.pi,2) / (6 * bo_lambda) )

#Until we sample another 5%...
for i in range(size):

    #run smbo and get the sparse GP parameters to select the next instance
    mean, var, m = smbo(start_X, start_y, remaining_X, remaining_y)

    #get the UCB value at each x
    alpha_full = mean + math.sqrt(beta) * var

    #get the index for the row with the largest UCB
    ind = np.argmax(alpha_full)

    #adding the row of the selected index to the starting data
    start_X = np.vstack((start_X, remaining_X[ind,:]))
    start_y = np.vstack((start_y, remaining_y[ind]))

    #removing the row of the selected index from the remaining data
    remaining_X = np.delete(remaining_X, ind, axis=0)
    remaining_y = np.delete(remaining_y, ind)

    


In [46]:
#Evaluating the model after training on the 10% of data

#setup the kernel
k = GPy.kern.RBF(start_X.shape[1])

#training and optimizing a regular GPRegression model
m = GPy.models.GPRegression(start_X, start_y, k)
m.optimize('bfgs', max_iters=10)

#Get the predicted values
pred_means, pred_vars = m.predict(remaining_X)

#get the r^2 value
r2_score(remaining_y, pred_means)

model trained
model optimized
[[ 7.56773948]
 [10.22221204]
 [ 8.3164862 ]
 ...
 [ 5.79824554]
 [ 6.16710422]
 [ 6.84037037]]
