In [2]:
import numpy as np
import torch
import pandas as pd
import odbo
import os
import gc

## TuRBO for the next best experiment

In [3]:
# Seed NumPy's global RNG so the shuffle below is reproducible.
random_seed = 0
np.random.seed(random_seed)

# Load the sequence codes ('AACombo') and their measured fitness values.
data_test = pd.read_csv('../datasets/GB1_2014_536944.csv', sep=',')
name_pre, Y_test = np.array(data_test['AACombo']), np.array(data_test['Fitness'])
del data_test

shuffle_file = 'shuffle_order_GB1_2014.npy'
sele_file = 'sele_indices_GB1_2014.npy'
# The selected indices are only meaningful together with the shuffle order
# they were computed on, so the cache is used only when BOTH files exist.
# (Previously only the indices file was checked, and a missing shuffle file
# crashed np.load.)
use_cache = os.path.isfile(sele_file) and os.path.isfile(shuffle_file)

if use_cache:
    shuffle_order = np.load(shuffle_file)
else:
    shuffle_order = np.arange(len(Y_test))
    # Shuffle everything except index 0, which is left in place.
    np.random.shuffle(shuffle_order[1:])
    np.save(shuffle_file, shuffle_order)

# Apply the permutation to rows 1..end of both arrays, then encode the sequences.
name_pre[1:], Y_test[1:] = name_pre[shuffle_order[1:]], Y_test[shuffle_order[1:]]
name = odbo.utils.code_to_array(name_pre)

if use_cache:
    sele_indices = np.load(sele_file)
else:
    # NOTE(review): 55 presumably is the number of sequence positions
    # (it matches dim=55 used for the TuRBO state later) - confirm.
    sele_indices = odbo.initialization.initial_design(name, least_occurance=np.ones(55),allow_abundance=True)
    np.save(sele_file, sele_indices)

# Split into the initial training set and the remaining search pool.
name_sele, Y_train = name[sele_indices, :], Y_test[sele_indices]
ids_keep = np.delete(range(len(Y_test)), sele_indices)
name, Y_test = name[ids_keep, :], Y_test[ids_keep]
print('Selected initial experiments no. is ', len(Y_train))
print('Select max Y: ', Y_train.max(), 'True max Y:', Y_test.max())


Selected initial experiments no. is  137
Select max Y:  1.576 True max Y: 5.022


In [13]:
def rf_feature_selection(
    X,
    Y,
    threshold=1e-4,
    n_splits=20,
    test_size=0.5,
    impurity=False,
    feature_sele_random=None,):
    """Rank features by how much shuffling them hurts a random forest's R^2.

    For each of ``n_splits`` random train/test splits a
    ``RandomForestRegressor`` is fitted on the training part. Every column of
    the held-out part is then shuffled in turn and the relative drop in R^2,
    ``(r2 - r2_shuffled) / r2``, is recorded. Features whose mean relative
    drop is at least ``threshold`` are selected.

    Args:
        X: 2-D array of features, indexed as ``X[rows]`` and ``X[:, col]``.
        Y: Array of targets, indexed by row and flattened with ``.ravel()``.
        threshold: Minimum mean relative R^2 drop for a feature to be kept.
        n_splits: Number of random splits.
        test_size: Held-out fraction passed to ``ShuffleSplit``.
        impurity: When true, also compute ``IPR`` (see Returns).
        feature_sele_random: ``None`` or an int used as ``random_state`` of
            the splitter only. The forest itself and the column shuffling
            (NumPy global RNG) are not seeded by it.

    Returns:
        ``(selected_features, importance, IPR)``: column indices ordered by
        decreasing score, their scores, and
        ``1 / sum((importance / sum(importance)) ** 2)`` when ``impurity`` is
        true, otherwise ``None``.

    Raises:
        TypeError: If ``feature_sele_random`` is neither ``None`` nor an int.
    """
    from collections import defaultdict
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score
    from sklearn.model_selection import ShuffleSplit

    rf = RandomForestRegressor()
    scores = defaultdict(list)
    if feature_sele_random is None:
        test_train_split = ShuffleSplit(n_splits=n_splits, test_size=test_size)
    elif isinstance(feature_sele_random, int):
        test_train_split = ShuffleSplit(
            n_splits=n_splits, test_size=test_size, random_state=feature_sele_random)
    else:
        # TypeError is still an Exception, so existing handlers keep working.
        raise TypeError("The specified feature_sele_random needs to be an integer.")

    for train_idx, test_idx in test_train_split.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx].ravel(), Y[test_idx].ravel()
        rf.fit(X_train, Y_train)
        accuracy = r2_score(Y_test, rf.predict(X_test))
        for i in range(X.shape[1]):
            # Shuffle one column of the held-out block in place, score the
            # model, then restore the column before moving to the next one.
            original_column = X_test[:, i].copy()
            np.random.shuffle(X_test[:, i])
            shuffle_accuracy = r2_score(Y_test, rf.predict(X_test))
            X_test[:, i] = original_column
            scores[i].append((accuracy - shuffle_accuracy) / accuracy)

    # Highest mean score first; ties are ordered by descending feature index.
    sorted_scores = sorted(
        [(np.mean(feature_score), feature_index) for feature_index, feature_score in scores.items()],
        reverse=True)

    selected_features = []
    importance = []
    # Take the leading run of features at or above the threshold. Iterating
    # over the list (instead of an unbounded index) avoids the IndexError the
    # old loop raised when every feature passed or when there were none.
    for mean_score, feature_index in sorted_scores:
        if not mean_score >= threshold:
            break
        selected_features.append(int(feature_index))
        importance.append(mean_score)

    IPR = None
    if impurity:
        renorm = np.asarray(importance) / np.sum(importance)
        IPR = 1 / np.sum(renorm ** 2)
    print(importance)
    return selected_features, importance, IPR


In [None]:
# --- Search configuration ----------------------------------------------------
l, search_iter = 0, 50
gp_method='gp_regression'
tr_length = [3.2]
batch_size = 1
failure_count,max_count = 0,0
state = odbo.turbo.TurboState(dim=55, batch_size=batch_size, length=tr_length, n_trust_regions=len(tr_length), failure_tolerance = 10)
state.best_value = Y_train.max()

# Working copies: the loop grows the training set and shrinks the search pool
# without touching the arrays produced by the data-loading cell.
name_sele_temp = np.array(name_sele).copy()
search_name = np.array(name).copy()
# Fitness values kept row-aligned with search_name. Rows are dropped from both
# together; indexing the never-shrinking Y_test (as before) returned fitness of
# the wrong sequences from the second iteration onward.
search_Y = np.array(Y_test).copy()
Y_train_sele = torch.tensor(Y_train.reshape(len(Y_train),1))


def _prescreen_candidates(X_train, Y_train_t, X_pool, thres):
    """Fit the XGBOD prescreener on the training set and return the pool row
    indices whose predicted label is 0."""
    labels_train = odbo.prescreening.sp_label(X_train, Y_train_t, thres=thres)
    pre_model = odbo.prescreening.XGBOD(eval_metric = 'error')
    pre_model.fit(X_train, labels_train)
    pred_labels = pre_model.predict(X_pool)
    return list(np.where(pred_labels == 0)[0])


def _near_top_value(Y_column):
    """Return the value at position int(0.99 * N) of the ascending-sorted targets.

    The earlier np.argsort(Y_train_sele) call sorted along the last axis of
    the (N, 1) column, which has length 1, so every index was 0 and the
    lookup always returned the first training value.
    """
    ordered = np.sort(Y_column.detach().numpy().ravel())
    return float(ordered[int(0.99 * len(ordered))])


while l < search_iter:
    # Track consecutive iterations in which the newest batch did not beat the
    # best value seen before it.
    if Y_train_sele[-batch_size:].detach().numpy().max() < Y_train_sele[:-batch_size].max():
        failure_count = failure_count + 1
    else:
        failure_count = 0
    if failure_count >= 3 and max_count < 3:
        max_count = max_count + 1
    else:
        max_count = 0
    # NOTE(review): every branch of the original ended up with method='Avg'
    # (the method='Max' transforms were built and then overwritten), so the
    # transform is built once here. If switching to 'Max' after repeated
    # failures was intended, it has to be wired to the counters above.
    feature_model = odbo.featurization.MeasurementFeatureTransform(raw_vars=name_sele_temp, Y=Y_train_sele.detach().numpy(), method='Avg', mode='rank_assist')

    X_test_trans = torch.tensor(feature_model.transform(search_name))
    X_train_sele_trans = torch.tensor(feature_model.transform(name_sele_temp))
    threshold = 1.4
    print('Feature transformation done', threshold)

    sele_id_test = _prescreen_candidates(X_train_sele_trans, Y_train_sele, X_test_trans, threshold)
    del feature_model
    gc.collect()

    print(len(sele_id_test))
    # Re-run the prescreening with a data-driven threshold when the fixed one
    # keeps too many or too few candidates.
    if len(sele_id_test) >= 50000:
        threshold = min(2.0, _near_top_value(Y_train_sele))
        sele_id_test = _prescreen_candidates(X_train_sele_trans, Y_train_sele, X_test_trans, threshold)
    elif len(sele_id_test) <= 100:
        threshold = _near_top_value(Y_train_sele)
        sele_id_test = _prescreen_candidates(X_train_sele_trans, Y_train_sele, X_test_trans, threshold)

    # Keep only feature columns that are not constant over the training set.
    selected_features = [
        i for i in range(X_train_sele_trans.shape[1])
        if bool((X_train_sele_trans[:, i] - X_train_sele_trans[0, i]).any())
    ]
#     selected_features, importance, IPR = rf_feature_selection(X=X_train_sele_trans.numpy(), Y=Y_train_sele.numpy())
#     print(len(selected_features))
    X_test_trans, X_train_sele_trans = X_test_trans[:, selected_features], X_train_sele_trans[:, selected_features]

    print('Prescreened search space size: ', len(sele_id_test))
    search_name_sele = search_name[sele_id_test, :]
    # clone().detach() instead of torch.tensor(tensor) avoids the copy-construct warning.
    X_test_sele_trans = X_test_trans[sele_id_test, :].clone().detach()
    Y_test_sele = torch.tensor(search_Y[sele_id_test].reshape(len(sele_id_test),1))
    print("Iter: ", l, "Current Max: ", Y_train_sele.max().detach().numpy(), 'TR length: ', state.length, "Test max: ", Y_test_sele.max().detach().numpy())
    X_next, acq_value, raw_next_exp_id = odbo.turbo_design(state=state, X=X_train_sele_trans, Y=Y_train_sele, X_pending=X_test_sele_trans, n_trust_regions=len(tr_length), batch_size=batch_size, gp_method=gp_method)
    Y_next_m = torch.zeros((len(tr_length), batch_size, 1), device=Y_train_sele.device, dtype=Y_train_sele.dtype)
    next_exp_id = []
    for i in range(batch_size):
        # For each batch slot, record every trust region's proposal and keep
        # the one with the highest measured value.
        next_exp_id_m = raw_next_exp_id[:, i]
        Y_next_m[:, i, 0], idtoadd = Y_test_sele[next_exp_id_m].reshape(len(tr_length)), next_exp_id_m[np.argmax(Y_test_sele[next_exp_id_m])]
        next_exp_id.append(idtoadd)

    # Move the chosen experiments from the search pool into the training set.
    Y_train_sele = torch.cat([Y_train_sele, Y_test_sele[next_exp_id]])
    name_sele_temp = np.concatenate((name_sele_temp, search_name_sele[next_exp_id]))
    chosen_pool_rows = np.array(sele_id_test)[next_exp_id]
    print(search_name[chosen_pool_rows])
    ids_keep = np.delete(range(len(search_name)), chosen_pool_rows)
    search_name = search_name[ids_keep]
    search_Y = search_Y[ids_keep]
    print("Newly added value: ", Y_train_sele[-batch_size:].detach().numpy(), name_sele_temp[-1], "Current size: ", len(Y_train_sele))
    state = odbo.turbo.update_state(state=state, Y_next=Y_next_m)
    l = l + 1



Feature transformation done 1.4
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


808
Prescreened search space size:  808
Iter:  0 Current Max:  1.576 TR length:  [3.2] Test max:  3.359
[['Q' 'Y' 'K' 'L' 'I' 'L' 'N' 'G' 'K' 'T' 'L' 'K' 'G' 'E' 'T' 'T' 'T' 'E'
  'A' 'V' 'D' 'T' 'A' 'T' 'A' 'E' 'K' 'V' 'F' 'K' 'L' 'Y' 'A' 'N' 'D' 'N'
  'G' 'V' 'D' 'G' 'E' 'W' 'T' 'Y' 'D' 'D' 'A' 'T' 'K' 'T' 'F' 'T' 'V' 'T'
  'E']]
Newly added value:  [[0.961]] ['Q' 'Y' 'K' 'L' 'I' 'L' 'N' 'G' 'K' 'T' 'L' 'K' 'G' 'E' 'T' 'T' 'T' 'E'
 'A' 'V' 'D' 'T' 'A' 'T' 'A' 'E' 'K' 'V' 'F' 'K' 'L' 'Y' 'A' 'N' 'D' 'N'
 'G' 'V' 'D' 'G' 'E' 'W' 'T' 'Y' 'D' 'D' 'A' 'T' 'K' 'T' 'F' 'T' 'V' 'T'
 'E'] Current size:  138
Feature transformation done 1.4
Parameters: { "silent" } might not be used.
