In [1]:
%load_ext autoreload
%autoreload 2

import warnings
import os 

import numpy as np
from matplotlib import pyplot as plt 
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, Matern
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

os.chdir("../../")

plt.style.use("src/matplotlib.rc")
warnings.filterwarnings('ignore') 
np.random.seed(47)

Data loading (same as tutorial 1)

In [None]:
!wget https://github.com/usnistgov/remi/raw/nist-pages/data/Combinatorial%20Libraries/Fe-Co-Ni/FeCoNi_benchmark_dataset_220501a.mat
!mkdir datasets
!mv FeCoNi_benchmark_dataset_220501a.mat datasets/

In [2]:
from src.utils import load_ternary_data

X, Y = load_ternary_data('datasets/FeCoNi_benchmark_dataset_220501a.mat')
n_features, n_properties = X.shape[1], Y.shape[1]

x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(-1, 1))

scalers = [x_scaler, y_scaler]

X_norm = x_scaler.fit_transform(X)
Y_norm = y_scaler.fit_transform(Y)



Algorithm definition (same as tutorial 1)

In [3]:
from src.algorithms import SubsetAlgorithm

class PercentileIntersection2D(SubsetAlgorithm):
    def __init__(self, user_algo_params):
        super().__init__(user_algo_params)

    def user_algorithm(self, f_x, x):
        # percentile threshold value for property 1 
        percentile_list = self.user_algo_params['percentile_list']
        percentile_threshold_p1 = np.percentile(f_x[:, 0], percentile_list[0])

        # percentile threshold value for property 2    
        percentile_threshold_p2 = np.percentile(f_x[:, 1], percentile_list[1])

        # Determine the ids where each condition holds seperately 
        ids1 = set(np.where(f_x[:,0] >= percentile_threshold_p1)[0])
        ids2 = set(np.where(f_x[:,1] >= percentile_threshold_p2)[0])

        # Take the union to express the "either/or" logic. 
        return list(ids1.union(ids2))

## Calculating the $\mathsf{Number}$ $\mathsf{Obtained}$ metric

The $\mathsf{Number}$ $\mathsf{Obtained}$ metric quanties how many true target ids have been measured. (Or equivalently, how many measurements actually satisfy the experimental goal.)

In [4]:
user_algo_params = {'scalers': scalers, 'percentile_list': [90, 90]}
algorithm = PercentileIntersection2D(user_algo_params)

# true target ids (ground_truth)
target_subset_ids = algorithm.identify_subspace(f_x = Y_norm, x = X_norm)
print(target_subset_ids)

[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 537, 538, 25, 26, 27, 28, 29, 721, 722, 566, 567, 56, 57, 58, 59, 60, 61, 62, 64, 65, 67, 68, 72, 74, 75, 595, 84, 596, 597, 98, 99, 100, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 622, 623, 624, 125, 128, 648, 649, 650, 651, 652, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 675, 173, 677, 696, 697, 698, 699, 700, 701, 198, 199, 200, 201, 202, 203, 204, 205, 206, 719, 720, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 723, 724, 740, 741, 742, 743, 744, 760, 761, 762, 763, 764, 765, 779, 780, 781, 782, 783, 784, 797, 798, 799, 800, 801, 802, 814, 815, 816, 817, 818, 673, 829, 830, 831, 674, 832, 833, 676, 843, 844, 845, 846, 847, 856, 857, 858, 859, 868, 869, 870, 871, 879, 880, 881, 889, 890, 897, 898, 904, 905, 910, 914, 915, 918, 920, 506]


Now, let's assume that we have made <code>n_data</code> measurements. 

In [5]:
n_data = 50 # max is X.shape[1]
collected_ids = np.random.choice(np.arange(0, X_norm.shape[0] + 1), size=n_data, replace=False)
print(collected_ids)

[517 236 507 358 907 347 315 520 805 853 533 713   7 654 435 616 485 329
  75 505 455 561 689 645 261  33 296 461 488 872  70 587  79 453 196 400
 371 166 583 530 808 844 518  20 241 157 475 272 383 648]


In [6]:
def get_n_obtained(collected_ids: list, true_target_ids: list) -> int:
    n_obtained = len(set(collected_ids).intersection(set(true_target_ids)))
    return n_obtained

In [7]:
get_n_obtained(collected_ids=collected_ids, true_target_ids=target_subset_ids)

6

Note, the above number is low. This is because we chose datapoints via random sampling. We aim to do significantly better using a smart acquisition approach (see tutorial notebook 3). 

## Calculating the $\mathsf{Posterior}$ $\mathsf{Jaccard}$ $\mathsf{Index}$ metric

The Posterior Jaccard Index quantifies how well the MODEL understands the location/shape of the true target subset. To begin, let's train a simple GP model based on a small amount of data.

In [8]:
from src.models import MGPR

X_train, X_test, y_train, y_test = train_test_split(X_norm, Y_norm, test_size=0.975, random_state=42)
X_train.shape, y_train.shape

kernel_initial = ConstantKernel(constant_value=1.0, constant_value_bounds=[0.01, 3.0]) * Matern(nu = 5/2, length_scale= n_features * [1.0], length_scale_bounds= n_features * [[0.01, 3.0]]) + WhiteKernel(noise_level=0.01, noise_level_bounds='fixed')
kernel_initial_list = n_properties * [kernel_initial]
multi_gpr = MGPR(kernel_list=kernel_initial_list)
multi_gpr.fit(X_train, y_train)


Now, we can take the posterior mean, $\bar{f}$ as the overall prediction of the true function.

In [9]:
posterior_mean, posterior_std = multi_gpr.predict(X_norm)

Essentially, this metric looks at the difference in the ground truth set and the predicted set.

In [10]:
user_algo_params = {'scalers': scalers, 'percentile_list': [90, 90]}
algorithm = PercentileIntersection2D(user_algo_params)

# true target ids (ground_truth)
target_subset_ids = algorithm.identify_subspace(f_x = Y_norm, x = X_norm)
print(target_subset_ids)

print(" ")
# predicted target ids based on posterior mean function
posterior_mean_ids = algorithm.identify_subspace(f_x = posterior_mean, x = X_norm)
print(posterior_mean_ids)

[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 537, 538, 25, 26, 27, 28, 29, 721, 722, 566, 567, 56, 57, 58, 59, 60, 61, 62, 64, 65, 67, 68, 72, 74, 75, 595, 84, 596, 597, 98, 99, 100, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 622, 623, 624, 125, 128, 648, 649, 650, 651, 652, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 675, 173, 677, 696, 697, 698, 699, 700, 701, 198, 199, 200, 201, 202, 203, 204, 205, 206, 719, 720, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 723, 724, 740, 741, 742, 743, 744, 760, 761, 762, 763, 764, 765, 779, 780, 781, 782, 783, 784, 797, 798, 799, 800, 801, 802, 814, 815, 816, 817, 818, 673, 829, 830, 831, 674, 832, 833, 676, 843, 844, 845, 846, 847, 856, 857, 858, 859, 868, 869, 870, 871, 879, 880, 881, 889, 890, 897, 898, 904, 905, 910, 914, 915, 918, 920, 506]
 
[0, 1, 2, 3, 4, 5, 6, 7, 8, 652, 537, 538, 539, 540, 541, 542, 722, 566, 567, 568, 569, 570, 571, 57

The Jaccard Posterior Index measures the intersection/union of these two sets. This is a metric between 0.0 and 1.0 which measures set overlap; here, 1.0 means the two sets are identical (i.e. a perfect model in the target region of the design space) and 0.0 means the two sets are disjoint (a terrible model in target space).

In [11]:
def get_jaccard_posterior(predicted_target_ids: list, true_target_ids: list) -> float:
    intersection = len(set(predicted_target_ids).intersection(set(true_target_ids)))
    union = len(set(predicted_target_ids).union(set(true_target_ids)))
    jaccard =  intersection/union 
    return jaccard

In [12]:
get_jaccard_posterior(predicted_target_ids = posterior_mean_ids, true_target_ids = target_subset_ids)

0.4144486692015209

Please check out [tutorial_4](tutorial_4_data_acquisition_using_BAX.ipynb) for an example of data acquisition using BAX.