In [1]:
import numpy as np
import torch
import pandas as pd
import odbo
import os

## Get initial experiment design

In [24]:
# Load the GB1 dataset, shuffle it reproducibly, and obtain the initial
# experiment design (either from a local cache or by generating a new one).
random_seed = 8
np.random.seed(random_seed)
data_test = pd.read_csv('../datasets/GB1_2016_149361.csv', sep=',')
name_pre, Y_test = np.array(data_test['AACombo']), np.array(data_test['Fitness'])
# Shuffle every row except the first one, which stays at index 0
# (presumably the reference/wild-type sequence -- TODO confirm).
shuffle_order = np.arange(len(Y_test))
np.random.shuffle(shuffle_order[1:])
name_pre[1:], Y_test[1:] = name_pre[shuffle_order[1:]], Y_test[shuffle_order[1:]]
name = odbo.utils.code_to_array(name_pre)

# Both cache files are loaded together, so both must exist before the cache
# is trusted; otherwise fall back to generating a fresh initial design.
sele_experiment_file = 'sele_experiment_GB1_2016.npy'
sele_fitness_file = 'sele_fitness_GB1_2016.npy'
if os.path.isfile(sele_experiment_file) and os.path.isfile(sele_fitness_file):
    name_sele = np.load(sele_experiment_file)
    Y_train = np.load(sele_fitness_file)
    print('Selected initial experiments no. is ', len(Y_train))
else:
    # Require each amino-acid code to show up at least twice at every site
    sele_indices = odbo.initialization.initial_design(name, least_occurance=[2,2,2,2],verbose = False,random_state=random_seed)
    # Initial experiments are name_sele with measured fitness Y_train
    name_sele, Y_train = name[sele_indices, :], Y_test[sele_indices]
    print('Selected initial experiments no. is ', len(sele_indices))
print('Select max Y: ', Y_train.max())

Selected initial experiments no. is  40
Select max Y:  1.320616068


## Transform AA codes to max-measurement fitness features

In [27]:
# Fit the max-measurement featurizer on the initial design, then encode both
# the initial design and the full candidate pool with it.
y_mean, y_std = np.mean(Y_train), np.std(Y_train)
threshold = max(0.05, y_mean - 2 * y_std)  # not read anywhere in this cell
feature_model = odbo.featurization.MaxMeasurement(Y=Y_train, raw_vars=name_sele)
X_train, X_test = (feature_model.transform(codes) for codes in (name_sele, name))

## Random selection

In [25]:
# Random baseline: draw 50 fitness values from the full pool without
# replacement and append them to the initial-design measurements.
sele_Y = list(np.random.choice(Y_test, 50, replace = False))
Y_train_sele = list(Y_train.copy())
Y_train_sele.extend(sele_Y)
print('Max Y', max(sele_Y))
# np.save does not create missing directories, so make sure the results
# folder exists before writing.
random_result_path = 'results/GB1_2016/GB1_2016_random_{}.npy'.format(random_seed)
os.makedirs(os.path.dirname(random_result_path), exist_ok=True)
np.save(random_result_path, Y_train_sele)

Max Y 3.978361133


## Naive BO

In [28]:
# Naive BO baseline: each iteration asks odbo.bo_design for the next
# experiment, moves it from the search pool into the training set, and then
# re-fits the featurization on the enlarged training set.
X_train_sele, Y_train_sele = torch.tensor(X_train), torch.tensor(Y_train.reshape(len(Y_train),1))
X_test_sele, Y_test_sele = torch.tensor(X_test), torch.tensor(Y_test.reshape(len(Y_test),1))
search_name_sele, name_sele_temp = name, name_sele

l, search_iter = 0, 50
gp_method='gp_regression'
batch_size = 1
# Number of consecutive iterations whose new point did not beat the best so far.
failure_count = 0
# Number of consecutive iterations that used the fallback featurization.
# Defined up front: previously it was first read inside the loop and only
# escaped a NameError because `failure_count >= 3` short-circuits the `and`
# on the first pass.
max_count = 0

while l < search_iter:
    print("Iter: ", l, "Current Max: ", Y_train_sele.max().detach().numpy(), "Test max: ", Y_test_sele.max().detach().numpy())
    X_next, acq_value, next_exp_id = odbo.bo_design(X=X_train_sele, Y=Y_train_sele, X_pending=X_test_sele, gp_method=gp_method, batch_size=batch_size)
    # Indices of the pool entries that were NOT selected this round.
    ids_keep = list(np.delete(range(X_test_sele.shape[0]), next_exp_id))
    # Move the selected experiment(s) from the search pool to the training set.
    X_train_sele, Y_train_sele = torch.cat([X_train_sele, X_test_sele[next_exp_id, :]]), torch.cat([Y_train_sele, Y_test_sele[next_exp_id]])
    X_test_sele, Y_test_sele = X_test_sele[ids_keep, :], Y_test_sele[ids_keep]
    name_sele_temp = np.concatenate((name_sele_temp, search_name_sele[next_exp_id]))
    search_name_sele = search_name_sele[ids_keep]
    # NOTE(review): the featurizer is fit on the already-featurized
    # X_train_sele, whereas the initial featurization in this notebook is fit
    # on the AA codes (name_sele). Confirm that passing features rather than
    # name_sele_temp as raw_vars is intended.
    feature_model1 = odbo.featurization.MaxMeasurement(raw_vars=X_train_sele, Y=Y_train_sele.detach().numpy())
    print("Newly added value: ", Y_train_sele[-batch_size:].detach().numpy(), name_sele_temp[-1])
    # A round counts as a failure when the new batch does not exceed the
    # best value among all previously collected measurements.
    if Y_train_sele[-batch_size:].detach().numpy().max() <= Y_train_sele[:-batch_size].max():
        failure_count = failure_count + 1
    else:
        failure_count = 0
    # After 3+ consecutive failures, use the average-measurement featurization
    # instead, for at most 3 consecutive iterations; otherwise reset the counter.
    if failure_count >= 3 and max_count < 3:
        max_count = max_count + 1
        feature_model1 = odbo.featurization.AvgMeasurement(raw_vars=X_train_sele, Y=Y_train_sele.detach().numpy())
    else:
        max_count = 0
    X_train_sele = torch.tensor(feature_model1.transform(X_train_sele))
    X_test_sele= torch.tensor(feature_model1.transform(X_test_sele))
    l = l + 1
# np.save does not create missing directories, so make sure the results
# folder exists before writing.
bo_result_path = 'results/GB1_2016/GB1_2016_BO_GP_batch1_{}.npy'.format(random_seed)
os.makedirs(os.path.dirname(bo_result_path), exist_ok=True)
np.save(bo_result_path, Y_train_sele)

Iter:  0 Current Max:  1.320616068 Test max:  8.761965656
Newly added value:  [[0.31354108]] ['Q' 'V' 'G' 'A']
Iter:  1 Current Max:  1.320616068 Test max:  8.761965656
Newly added value:  [[0.00384758]] ['Q' 'D' 'A' 'V']
Iter:  2 Current Max:  1.320616068 Test max:  8.761965656
Newly added value:  [[0.01078996]] ['Q' 'V' 'A' 'V']
Iter:  3 Current Max:  1.320616068 Test max:  8.761965656
Newly added value:  [[0.66989285]] ['Q' 'D' 'A' 'A']
Iter:  4 Current Max:  1.320616068 Test max:  8.761965656
Newly added value:  [[4.50273083]] ['V' 'V' 'A' 'A']
Iter:  5 Current Max:  4.502730826 Test max:  8.761965656
Newly added value:  [[4.15798313]] ['V' 'D' 'A' 'A']
Iter:  6 Current Max:  4.502730826 Test max:  8.761965656
Newly added value:  [[1.76143518]] ['V' 'V' 'G' 'A']
Iter:  7 Current Max:  4.502730826 Test max:  8.761965656
Newly added value:  [[0.35189879]] ['V' 'V' 'A' 'V']
Iter:  8 Current Max:  4.502730826 Test max:  8.761965656
Newly added value:  [[4.682612]] ['V' 'I' 'A' 'A']
Ite

## TuRBO

In [25]:
# TuRBO baseline: each iteration asks odbo.turbo_design for candidates per
# trust region, moves the chosen experiment from the search pool into the
# training set, updates the TuRBO state, and re-fits the featurization.
X_train_sele, Y_train_sele = torch.tensor(X_train), torch.tensor(Y_train.reshape(len(Y_train),1))
search_name_sele, name_sele_temp = name, name_sele
X_test_sele, Y_test_sele = torch.tensor(X_test), torch.tensor(Y_test.reshape(len(Y_test),1))

l, search_iter = 0, 50
gp_method='gp_regression'
tr_length = [3.2]  # one entry per trust region (len(tr_length) is passed as n_trust_regions)
batch_size = 1
# Number of consecutive iterations whose new point did not beat the best so far.
failure_count = 0
# Number of consecutive iterations that used the fallback featurization.
# Defined up front: previously it was first read inside the loop and only
# escaped a NameError because `failure_count >= 3` short-circuits the `and`
# on the first pass.
max_count = 0

state = odbo.turbo.TurboState(dim=X_train_sele.shape[1], batch_size=batch_size, length=tr_length, n_trust_regions=len(tr_length), failure_tolerance = 10)
state.best_value = Y_train_sele.max()
while l < search_iter:
    print("Iter: ", l, "Current Max: ", Y_train_sele.max().detach().numpy(), 'TR length: ', state.length, "Test max: ", Y_test_sele.max().detach().numpy())
    X_next, acq_value, raw_next_exp_id = odbo.turbo_design(state=state, X=X_train_sele, Y=Y_train_sele, X_pending=X_test_sele, n_trust_regions=len(tr_length), batch_size=batch_size, gp_method=gp_method)
    # Measured values of every proposed candidate, indexed as
    # (trust region, batch slot, 1); passed to update_state below.
    Y_next_m = torch.zeros((len(tr_length), batch_size, 1), device=Y_train_sele.device, dtype=Y_train_sele.dtype)
    next_exp_id = []
    for i in range(batch_size):
        next_exp_id_m = raw_next_exp_id[:, i]
        # Record all trust regions' values, but add only the best-scoring
        # candidate of this batch slot to the training set.
        Y_next_m[:, i, 0], idtoadd = Y_test_sele[next_exp_id_m].reshape(len(tr_length)), next_exp_id_m[np.argmax(Y_test_sele[next_exp_id_m])]
        next_exp_id.append(idtoadd)
    # Move the selected experiment(s) from the search pool to the training set.
    X_train_sele, Y_train_sele = torch.cat([X_train_sele, X_test_sele[next_exp_id, :]]), torch.cat([Y_train_sele, Y_test_sele[next_exp_id]])
    ids_keep = list(np.delete(range(X_test_sele.shape[0]), next_exp_id))
    X_test_sele, Y_test_sele = X_test_sele[ids_keep, :], Y_test_sele[ids_keep]
    name_sele_temp = np.concatenate((name_sele_temp, search_name_sele[next_exp_id]))
    search_name_sele = search_name_sele[ids_keep]
    state = odbo.turbo.update_state(state=state, Y_next=Y_next_m)
    # NOTE(review): the featurizer is fit on the already-featurized
    # X_train_sele, whereas the initial featurization in this notebook is fit
    # on the AA codes (name_sele). Confirm that passing features rather than
    # name_sele_temp as raw_vars is intended.
    feature_model1 = odbo.featurization.AvgMeasurement(raw_vars=X_train_sele, Y=Y_train_sele.detach().numpy())
    print("Newly added value: ", Y_train_sele[-batch_size:].detach().numpy(), name_sele_temp[-1])

    # A round counts as a failure when the new batch does not exceed the
    # best value among all previously collected measurements.
    if Y_train_sele[-batch_size:].detach().numpy().max() <= Y_train_sele[:-batch_size].max():
        failure_count = failure_count + 1
    else:
        failure_count = 0
    # After 3+ consecutive failures, use the max-measurement featurization
    # instead, for at most 3 consecutive iterations; otherwise reset the counter.
    if failure_count >= 3 and max_count < 3:
        max_count = max_count + 1
        feature_model1 = odbo.featurization.MaxMeasurement(raw_vars=X_train_sele, Y=Y_train_sele.detach().numpy())
    else:
        max_count = 0
    X_train_sele = torch.tensor(feature_model1.transform(X_train_sele))
    X_test_sele= torch.tensor(feature_model1.transform(X_test_sele))
    l = l + 1

# np.save does not create missing directories, so make sure the results
# folder exists before writing.
turbo_result_path = 'results/GB1_2016/GB1_2016_TuRBO_GP_batch1_{}.npy'.format(random_seed)
os.makedirs(os.path.dirname(turbo_result_path), exist_ok=True)
np.save(turbo_result_path, Y_train_sele)

Iter:  0 Current Max:  1.320616068 TR length:  [3.2] Test max:  8.761965656
Newly added value:  [[0.31354108]] ['Q' 'V' 'G' 'A']
Iter:  1 Current Max:  1.320616068 TR length:  [3.2] Test max:  8.761965656
Newly added value:  [[0.12057613]] ['V' 'D' 'A' 'V']
Iter:  2 Current Max:  1.320616068 TR length:  [3.2] Test max:  8.761965656
Newly added value:  [[0.01078996]] ['Q' 'V' 'A' 'V']
Iter:  3 Current Max:  1.320616068 TR length:  [3.2] Test max:  8.761965656
Newly added value:  [[0.66989285]] ['Q' 'D' 'A' 'A']
Iter:  4 Current Max:  1.320616068 TR length:  [3.2] Test max:  8.761965656
Iter 10/500: 1.5446513738095973
Iter 20/500: 1.4153160039662027
Iter 30/500: 1.2045239509092989
Iter 40/500: -0.3578078307775326
Iter 50/500: -0.3759927166049044
Iter 60/500: -0.3769269174103771
Iter 70/500: -0.37760742971329414
The scipy optimizer and minimum inferred noises cannot make the kernel PSD, switch to torch optimizer
Newly added value:  [[4.50273083]] ['V' 'V' 'A' 'A']
Iter:  5 Current Max:  4