# Gaussian Process Modeling

This notebook runs several query selection methods and evaluates their performance using a Gaussian Process model.

In [1]:
#Loading in packages
import numpy as np
import pandas as pd
import GPy
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.utils import shuffle
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the raw TYK2 table, discard the columns that are not used, and give the
# two remaining columns the names the rest of the notebook relies on.
# Note: the file's own 'target' column is among those dropped; the name is then
# reassigned to the second of the remaining columns.
column_names = ['smiles', 'target']
data = pd.read_csv("TYK2_final.csv").drop(columns=['target', 'top_2p', 'top_5p'])
data.columns = column_names
data

Unnamed: 0,smiles,target
0,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)N)Cl,5.608397
1,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...,7.972925
2,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...,6.731267
3,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...,7.653882
4,C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...,6.562022
...,...,...
9992,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)CF)Cl,7.232871
9993,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)CO)Cl,7.230769
9994,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)Cl)Cl,7.756025
9995,c1cncnc1Nc2cc(c(cn2)F)NC(=O)c3c(cc(cc3Cl)N)Cl,9.215634


In [3]:
#Converting to fingerprints
def smiles_to_fingerprint(smiles, nBits=4096):
    """Convert a SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    nBits : int, optional
        Length of the fingerprint bit vector (default 4096).

    Returns
    -------
    list
        The bits of the radius-4, chirality-aware Morgan fingerprint,
        converted to a plain Python list.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending string instead of deep inside the fingerprint call.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=4, nBits=nBits, useChirality=True)
    return list(fp)

# Compute one fingerprint per molecule and store it next to the SMILES/target columns.
data['fingerprint'] = data['smiles'].map(smiles_to_fingerprint)

# Quick look at the first rows to confirm the new column is present.
print(data.head())

                                              smiles    target  \
0            C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)N)Cl  5.608397   
1  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...  7.972925   
2  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC(...  6.731267   
3  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...  7.653882   
4  C=Cc1cc(c(c(c1)Cl)C(=O)Nc2cc(ncc2F)NC(=O)C3CC3...  6.562022   

                                         fingerprint  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [4]:
# Persist the table, including the new 'fingerprint' column, to disk.
# NOTE(review): each fingerprint is a Python list, so presumably it is written
# as its text representation - confirm before reloading this file elsewhere.
data.to_csv("TYK2_fingerprints.csv")

# Random Sampling

To perform random sampling, we will randomly select 80% of the instances from the dataset to train on, and evaluate with the remaining 20%

In [19]:
def random_sampling(batch_size, seeds=(1, 2, 3), initial_size=1000,
                    max_train_size=1500, eval_every=50):
    """Grow a training set by random selection and track GP test scores.

    For every seed the global ``data`` table is shuffled, the first
    ``initial_size`` rows become the training set and the rest form the pool.
    The training set is then extended by the next ``batch_size`` pool rows
    (the pool is already in random order) for as long as it holds at most
    ``max_train_size`` rows. Whenever the number of added rows is a multiple
    of ``eval_every``, a linear-kernel ``GPRegression`` model is built on the
    current training set and scored on the rows still in the pool.

    The model is used as constructed; no hyperparameter optimisation is run.
    The evaluation set is the shrinking pool, not a fixed hold-out set.

    Parameters
    ----------
    batch_size : int
        Number of pool rows moved into the training set per iteration.
    seeds : iterable of int, optional
        Random states for the shuffles; one run is performed per seed.
    initial_size : int, optional
        Size of the starting training set.
    max_train_size : int, optional
        Iteration continues while the training set has at most this many rows.
    eval_every : int, optional
        Evaluate when the count of added rows is a multiple of this value.
        Batch sizes that do not divide it are evaluated less often.

    Returns
    -------
    (list of list of float, list of list of float)
        RMSE and R^2 scores; one inner list per seed, one entry per evaluation.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (the loop could never terminate).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The feature matrix and targets do not depend on the seed: build them once.
    X_all = np.array(data['fingerprint'].tolist())
    y_all = data['target'].values.reshape(-1, 1)

    rmse_vals = []
    r2_vals = []

    for seed in seeds:
        rmse_instance = []
        r2_instance = []

        # shuffle returns shuffled copies, so X_all / y_all stay untouched.
        X, y = shuffle(X_all, y_all, random_state=seed)

        start_X, start_y = X[:initial_size], y[:initial_size]
        remaining_X, remaining_y = X[initial_size:], y[initial_size:]

        counter = 0
        # Stop as well once the pool is exhausted: nothing is left to add or
        # to evaluate on, and the training set could no longer grow.
        while len(start_X) <= max_train_size and len(remaining_X) > 0:

            if counter % eval_every == 0:
                # Build the GP on the current training set.
                kernel = GPy.kern.Linear(start_X.shape[1])
                model = GPy.models.GPRegression(start_X, start_y, kernel)

                # Score on everything that has not been selected yet.
                pred_means, _ = model.predict(remaining_X)
                rmse_instance.append(np.sqrt(mean_squared_error(remaining_y, pred_means)))
                r2_instance.append(r2_score(remaining_y, pred_means))

            # Move the next batch from the pool into the training set.
            start_X = np.vstack((start_X, remaining_X[:batch_size]))
            start_y = np.vstack((start_y, remaining_y[:batch_size]))
            remaining_X = remaining_X[batch_size:]
            remaining_y = remaining_y[batch_size:]

            counter += batch_size

        rmse_vals.append(rmse_instance)
        r2_vals.append(r2_instance)

    return rmse_vals, r2_vals

In [20]:
# Run random sampling for each batch size, aggregate the per-seed scores, and
# collect the summary statistics in one table that is written to disk.
batch_sizes = [1, 25, 50]
rand_rmse_dict, rand_r2_dict = {}, {}
final_rand_data = pd.DataFrame()

for batch in batch_sizes:
    # One list of scores per seed for each metric.
    rand_rmse, rand_r2 = random_sampling(batch)

    # RMSE: mean / standard deviation across seeds at every evaluation point.
    rand_rmse_mean = np.asarray(rand_rmse).mean(axis=0)
    rand_rmse_stdev = np.asarray(rand_rmse).std(axis=0)
    print("mean:", rand_rmse_mean)
    print("standard deviation:", rand_rmse_stdev)
    rand_rmse_dict[batch] = (rand_rmse, rand_rmse_mean, rand_rmse_stdev)
    final_rand_data[f"{batch}_rmse_mean"] = rand_rmse_mean
    final_rand_data[f"{batch}_rmse_stdev"] = rand_rmse_stdev

    # R^2: same aggregation.
    rand_r2_mean = np.asarray(rand_r2).mean(axis=0)
    rand_r2_stdev = np.asarray(rand_r2).std(axis=0)
    print("mean:", rand_r2_mean)
    print("standard deviation:", rand_r2_stdev)
    rand_r2_dict[batch] = (rand_r2, rand_r2_mean, rand_r2_stdev)
    final_rand_data[f"{batch}_r2_mean"] = rand_r2_mean
    final_rand_data[f"{batch}_r2_stdev"] = rand_r2_stdev

# Persist the summary columns for all batch sizes.
final_rand_data.to_csv("final_data_random.csv")

mean: [0.67851335 0.66577417 0.65735525 0.6493212  0.6450358  0.63415625
 0.63239482 0.62910644 0.62606269 0.62264844 0.61823679]
standard deviation: [0.0114247  0.00430973 0.00954599 0.00519653 0.0075443  0.0110923
 0.01172547 0.01033904 0.009591   0.0087983  0.01165945]
mean: [0.75157578 0.76091184 0.76655504 0.7721683  0.7750749  0.78249936
 0.78368823 0.78605724 0.78809443 0.79036235 0.79326152]
standard deviation: [0.00803293 0.00246722 0.0059438  0.00328136 0.00521593 0.00760325
 0.00801922 0.00727714 0.00695845 0.00649148 0.00860995]
mean: [0.67851335 0.66577417 0.65735525 0.6493212  0.6450358  0.63415625
 0.63239482 0.62910644 0.62606269 0.62264844 0.61823679]
standard deviation: [0.0114247  0.00430973 0.00954599 0.00519653 0.0075443  0.0110923
 0.01172547 0.01033904 0.009591   0.0087983  0.01165945]
mean: [0.75157578 0.76091184 0.76655504 0.7721683  0.7750749  0.78249936
 0.78368823 0.78605724 0.78809443 0.79036235 0.79326152]
standard deviation: [0.00803293 0.00246722 0.00594

# Uncertainty Sampling
## Mean, Variance, UCB

In [4]:
def smbo(start_X, start_y, remaining_X):
    """Fit a GP on the labelled data and predict on the unlabelled pool.

    A ``GPy.models.GPRegression`` model with a linear kernel is built from
    ``start_X`` / ``start_y`` and used, as constructed (no optimisation step
    is run here), to predict every row of ``remaining_X``. Choosing which
    instance to query from these predictions is left to the caller.

    Input: labelled starting data (features and targets) and the features of
           the remaining, unlabelled data.
    Output: the pair (mean, var) returned by ``predict`` with
            ``full_cov=False`` for the rows of ``remaining_X``.
    """
    # The kernel's input dimension is the number of feature columns.
    kernel = GPy.kern.Linear(start_X.shape[1])
    model = GPy.models.GPRegression(start_X, start_y, kernel)

    posterior_mean, posterior_var = model.predict(remaining_X, full_cov=False)
    return posterior_mean, posterior_var

In [22]:
def uncertainty_sampling(flag, batch_size, seeds=(1,), initial_size=1000,
                         max_train_size=1500, eval_every=50,
                         bo_lambda=0.1, bo_iters=1):
    """Grow a training set with a GP-based selection rule and track test scores.

    For every seed the global ``data`` table is shuffled, the first
    ``initial_size`` rows become the training set and the rest form the pool.
    While the training set holds at most ``max_train_size`` rows, ``smbo`` is
    called to get the GP mean and variance for every pool row, the pool rows
    are scored according to ``flag``, and the ``batch_size`` highest-scoring
    rows are moved from the pool into the training set. Whenever the number
    of rows added *before* the current batch is a multiple of ``eval_every``,
    a linear-kernel ``GPRegression`` model is built on the (already extended)
    training set and scored on the rows still in the pool.

    Parameters
    ----------
    flag : str
        ``"ucb"`` scores rows by mean + sqrt(beta) * standard deviation,
        ``"mean"`` by the predicted mean; any other value scores rows by the
        predicted variance.
    batch_size : int
        Number of pool rows selected per iteration.
    seeds : iterable of int, optional
        Random states for the shuffles; one run is performed per seed.
    initial_size : int, optional
        Size of the starting training set.
    max_train_size : int, optional
        Iteration continues while the training set has at most this many rows.
    eval_every : int, optional
        Evaluate when the count of previously added rows is a multiple of this.
    bo_lambda, bo_iters : float, int, optional
        Constants entering the UCB beta term.

    Returns
    -------
    (rmse_vals, r2_vals, fp)
        ``rmse_vals`` / ``r2_vals`` hold one list of scores per seed. ``fp``
        is a DataFrame with one row per selected instance (its feature
        vector, in selection order) for the last seed that was run.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (the loop could never terminate).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The feature matrix and targets do not depend on the seed: build them once.
    X_all = np.array(data['fingerprint'].tolist())
    y_all = data['target'].values.reshape(-1, 1)
    n_features = X_all.shape[1]

    # beta constant of the UCB rule; depends only on the dataset size and the
    # two constants, so it is the same for every seed.
    Dsize = len(X_all)
    beta = 2 * math.log(Dsize * math.pow(bo_iters, 2) * math.pow(np.pi, 2) / (6 * bo_lambda))

    rmse_vals = []
    r2_vals = []
    fp = pd.DataFrame(columns=list(range(n_features)))

    for seed in seeds:
        print(beta)

        rmse_instance = []
        r2_instance = []
        selected_rows = []  # feature rows picked during this seed's run

        # shuffle returns shuffled copies, so X_all / y_all stay untouched.
        X, y = shuffle(X_all, y_all, random_state=seed)

        start_X, start_y = X[:initial_size], y[:initial_size]
        remaining_X, remaining_y = X[initial_size:], y[initial_size:]

        counter = 0
        # Stop as well once the pool is exhausted: nothing is left to select.
        while len(start_X) <= max_train_size and len(remaining_X) > 0:

            # GP predictions for every row still in the pool.
            mean, var = smbo(start_X, start_y, remaining_X)

            if flag == "ucb":
                # Upper confidence bound uses the standard deviation, i.e. the
                # square root of the predictive variance (clipped at zero to
                # guard against tiny negative values from round-off).
                scores = mean + math.sqrt(beta) * np.sqrt(np.clip(var, 0, None))
            elif flag == "mean":
                scores = mean
            else:
                scores = var
            scores = np.ravel(scores)

            # Indices of the batch_size largest scores, in descending index
            # order (the order in which rows are appended to the training set).
            ranking = np.argsort(scores)
            inds = np.sort(ranking[-batch_size:])[::-1]

            # Move the selected rows from the pool into the training set.
            chosen_X = remaining_X[inds]
            start_X = np.vstack((start_X, chosen_X))
            start_y = np.vstack((start_y, remaining_y[inds]))
            selected_rows.append(chosen_X)

            # axis=0 keeps remaining_y as a column vector; without it
            # np.delete would flatten the array.
            remaining_X = np.delete(remaining_X, inds, axis=0)
            remaining_y = np.delete(remaining_y, inds, axis=0)

            if counter % eval_every == 0 and len(remaining_X) > 0:
                # Build the GP on the extended training set.
                kernel = GPy.kern.Linear(start_X.shape[1])
                model = GPy.models.GPRegression(start_X, start_y, kernel)

                # Score on everything that has not been selected yet.
                pred_means, _ = model.predict(remaining_X)
                rmse_instance.append(np.sqrt(mean_squared_error(remaining_y, pred_means)))
                r2_instance.append(r2_score(remaining_y, pred_means))

            counter += batch_size

        rmse_vals.append(rmse_instance)
        r2_vals.append(r2_instance)

        # Build the selection log once per seed: one row per selected
        # instance, one column per feature.
        if selected_rows:
            fp = pd.DataFrame(np.vstack(selected_rows), columns=list(range(n_features)))
        else:
            fp = pd.DataFrame(columns=list(range(n_features)))

    return rmse_vals, r2_vals, fp

In [23]:
# Run the UCB selection rule for each batch size, aggregate the per-seed
# scores, and save the feature vectors of the selected instances.
flag = "ucb"
batch_sizes = [50]
ucb_rmse_dict, ucb_r2_dict = {}, {}
final_ucb_data = pd.DataFrame()

for batch in batch_sizes:
    ucb_rmse, ucb_r2, fp = uncertainty_sampling(flag, batch)

    # RMSE: mean / standard deviation across seeds at every evaluation point.
    ucb_rmse_mean = np.asarray(ucb_rmse).mean(axis=0)
    ucb_rmse_stdev = np.asarray(ucb_rmse).std(axis=0)
    print("mean:", ucb_rmse_mean)
    print("standard deviation:", ucb_rmse_stdev)
    ucb_rmse_dict[batch] = (ucb_rmse, ucb_rmse_mean, ucb_rmse_stdev)
    final_ucb_data[f"{batch}_rmse_mean"] = ucb_rmse_mean
    final_ucb_data[f"{batch}_rmse_stdev"] = ucb_rmse_stdev

    # R^2: same aggregation.
    ucb_r2_mean = np.asarray(ucb_r2).mean(axis=0)
    ucb_r2_stdev = np.asarray(ucb_r2).std(axis=0)
    print("mean:", ucb_r2_mean)
    print("standard deviation:", ucb_r2_stdev)
    ucb_r2_dict[batch] = (ucb_r2, ucb_r2_mean, ucb_r2_stdev)
    final_ucb_data[f"{batch}_r2_mean"] = ucb_r2_mean
    final_ucb_data[f"{batch}_r2_stdev"] = ucb_r2_stdev

    # The file name is fixed, so with several batch sizes only the last
    # batch's selection would remain on disk.
    fp.to_csv("fingerprints_ucb.csv")

# Saving of the summary table is currently disabled:
#final_ucb_data.to_csv("final_data_ucb.csv")


24.020651444863944


KeyboardInterrupt: 

In [17]:
# Run selection by highest predicted mean for each batch size, aggregate the
# per-seed scores, and write the summary table to disk.
batch_sizes = [1, 25, 50]
flag = "mean"
mean_rmse_dict = {}
mean_r2_dict = {}

final_mean_data = pd.DataFrame()

for batch in batch_sizes:

    # uncertainty_sampling returns (rmse_vals, r2_vals, fp). Only the two score
    # lists are needed here, so anything after them is discarded; unpacking
    # into exactly two names raised "too many values to unpack".
    mean_rmse, mean_r2, *_ = uncertainty_sampling(flag, batch)

    # RMSE: mean / standard deviation across seeds at every evaluation point.
    mean_rmse_mean = np.mean(mean_rmse, axis=0)
    mean_rmse_stdev = np.std(mean_rmse, axis=0)
    print("mean:", mean_rmse_mean)
    print("standard deviation:", mean_rmse_stdev)
    mean_rmse_dict[batch] = (mean_rmse, mean_rmse_mean, mean_rmse_stdev)
    final_mean_data[f"{batch}_rmse_mean"] = mean_rmse_mean
    final_mean_data[f"{batch}_rmse_stdev"] = mean_rmse_stdev


    # R^2: same aggregation.
    mean_r2_mean = np.mean(mean_r2, axis=0)
    mean_r2_stdev = np.std(mean_r2, axis=0)
    print("mean:", mean_r2_mean)
    print("standard deviation:", mean_r2_stdev)
    mean_r2_dict[batch] = (mean_r2, mean_r2_mean, mean_r2_stdev)
    final_mean_data[f"{batch}_r2_mean"] = mean_r2_mean
    final_mean_data[f"{batch}_r2_stdev"] = mean_r2_stdev


# Persist the summary columns for all batch sizes.
final_mean_data.to_csv("final_data_mean.csv")

24.020651444863944
24.020651444863944
24.020651444863944
mean: [0.67845357 0.67447308 0.66812086 0.66800833 0.66710053 0.66689913
 0.6656566  0.66424068 0.66093307 0.65968608 0.65882304]
standard deviation: [0.01155563 0.01219732 0.01256901 0.01216398 0.01096211 0.01160074
 0.00943003 0.0088856  0.00976811 0.01036883 0.00980504]
mean: [0.75135272 0.74647127 0.74599032 0.74094491 0.7370467  0.73239053
 0.72923834 0.72639511 0.72512922 0.72234419 0.71943569]
standard deviation: [0.0081153  0.00856536 0.00921265 0.009102   0.00806927 0.00849449
 0.00689882 0.00646998 0.00760497 0.00824478 0.00793708]
24.020651444863944
24.020651444863944
24.020651444863944
mean: [0.67742975 0.67188746 0.66969058 0.66756113 0.66560561 0.66533538
 0.66546206 0.66270378 0.66100294 0.65999263 0.65879524]
standard deviation: [0.01287797 0.01225493 0.01239483 0.01240643 0.01188406 0.01071635
 0.01018414 0.00979287 0.01026874 0.00973006 0.00948009]
mean: [0.74816757 0.74597296 0.74267438 0.73915535 0.73612955 0.

In [18]:
# Run selection by highest predictive variance for each batch size, aggregate
# the per-seed scores, and write the summary table to disk.
batch_sizes = [1, 25, 50]
flag = "var"
var_rmse_dict = {}
var_r2_dict = {}


final_var_data = pd.DataFrame()

for batch in batch_sizes:

    # uncertainty_sampling returns (rmse_vals, r2_vals, fp). Only the two score
    # lists are needed here, so anything after them is discarded; unpacking
    # into exactly two names raised "too many values to unpack".
    var_rmse, var_r2, *_ = uncertainty_sampling(flag, batch)

    # RMSE: mean / standard deviation across seeds at every evaluation point.
    var_rmse_mean = np.mean(var_rmse, axis=0)
    var_rmse_stdev = np.std(var_rmse, axis=0)
    print("mean:", var_rmse_mean)
    print("standard deviation:", var_rmse_stdev)
    var_rmse_dict[batch] = (var_rmse, var_rmse_mean, var_rmse_stdev)
    final_var_data[f"{batch}_rmse_mean"] = var_rmse_mean
    final_var_data[f"{batch}_rmse_stdev"] = var_rmse_stdev


    # R^2: same aggregation.
    var_r2_mean = np.mean(var_r2, axis=0)
    var_r2_stdev = np.std(var_r2, axis=0)
    print("mean:", var_r2_mean)
    print("standard deviation:", var_r2_stdev)
    var_r2_dict[batch] = (var_r2, var_r2_mean, var_r2_stdev)
    final_var_data[f"{batch}_r2_mean"] = var_r2_mean
    final_var_data[f"{batch}_r2_stdev"] = var_r2_stdev


# Persist the summary columns for all batch sizes.
final_var_data.to_csv("final_data_var.csv")

24.020651444863944
24.020651444863944
24.020651444863944
mean: [0.67819991 0.6596941  0.64504371 0.63949791 0.63199611 0.62300157
 0.61664977 0.60738197 0.60327419 0.59863299 0.5946769 ]
standard deviation: [0.01164893 0.00730648 0.00946949 0.00944724 0.01016513 0.00913082
 0.00925947 0.0112312  0.01093421 0.00484561 0.00470246]
mean: [0.75180788 0.76343433 0.77249823 0.7754492  0.77999436 0.78589229
 0.78999894 0.79583891 0.79826274 0.8007928  0.80298902]
standard deviation: [0.00818068 0.00495074 0.00689047 0.0070961  0.00749665 0.00675914
 0.00689691 0.0077425  0.00743957 0.0038242  0.00366888]
24.020651444863944
24.020651444863944
24.020651444863944
mean: [0.67043076 0.66289466 0.66160931 0.65919448 0.64734462 0.64251213
 0.6368117  0.63406883 0.62517979 0.62050684 0.61389138]
standard deviation: [0.00651003 0.00675645 0.00877218 0.00517342 0.0075031  0.0093519
 0.01093841 0.00890461 0.00818736 0.00749722 0.00852374]
mean: [0.75674087 0.76129138 0.7608251  0.76183175 0.76911825 0.7