In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from botorch.acquisition.active_learning import qNegIntegratedPosteriorVariance
from botorch.models.gp_regression import SingleTaskGP
from tqdm import tqdm
from torch import Tensor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer,StandardScaler
import os

from botorch.exceptions.warnings import BotorchTensorDimensionWarning, InputDataWarning 

warnings.filterwarnings(
            "ignore",
            message="Input data is not standardized.",
            category=InputDataWarning,
        )

In [2]:
os.getcwd()

'/Users/ramseyissa/Documents/GitHub/qNIPV/notebooks'

In [3]:
df = pd.read_csv('../datasets/citrine_thermal_conductivity.csv')
df

Unnamed: 0,formula,k_expt,k-units,k_condition,k_condition_units
0,BeS,157.0,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
1,CdS,19.9,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
2,GaN,181.0,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
3,ZnO,64.5,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
4,ZnSe,15.6,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
...,...,...,...,...,...
867,SiC,40.0,Wm$^{-1}$K$^{-1}$,1773,K
868,Al2O3,6.0,Wm$^{-1}$K$^{-1}$,1773,K
869,ZrO2,2.4,Wm$^{-1}$K$^{-1}$,1773,K
870,ThO2,2.0,Wm$^{-1}$K$^{-1}$,1773,K


In [6]:
df.head()
df.value_counts()

formula            k_expt    k-units            k_condition       k_condition_units                                                    
Sr0.61Ba0.39Nb2O6  1.6670    W/m$\cdot$K        300               K                                                                        3
TiO2               0.3800    W\m K              Room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'Room temperature'}]}]    3
CuBr               2.7500    W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
GaN                181.0000  W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
BeO                447.0000  W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
                                                                                                                                          ..
CuCr0.97Mg0.03O2  

In [8]:
df['k_condition'].value_counts()

k_condition
300                 204
400                 187
700                 183
1000                129
room temperature     42
773                  25
373                  24
Room temperature     22
298                  19
1273                 19
1773                 10
Standard              8
Name: count, dtype: int64

In [4]:
for val in df['k_condition'].values:
    if val == 'room temperature':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    elif val == 'Standard':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    elif val == 'Room temperature':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    else:
        pass

    

In [11]:
df['k_condition'].value_counts()

k_condition
300     204
400     187
700     183
1000    129
300      72
773      25
373      24
298      19
1273     19
1773     10
Name: count, dtype: int64

In [15]:
df['k_condition'].value_counts()

k_condition
300     204
400     187
700     183
1000    129
300      72
773      25
373      24
298      19
1273     19
1773     10
Name: count, dtype: int64

In [17]:
df.columns

Index(['formula', 'k_expt', 'k-units', 'k_condition', 'k_condition_units'], dtype='object')

In [5]:
df.head()
df.drop(columns=['k-units','k_condition_units'],inplace=True)
df.head()

Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300
1,CdS,19.9,300
2,GaN,181.0,300
3,ZnO,64.5,300
4,ZnSe,15.6,300


In [6]:
x = 0
for indx,row in df.iterrows():
    if row['k_condition'] == 300 or row['k_condition'] == 298:
        x += 1
print(x)
        
# convert the k_condition column to float
df['k_condition'] = df['k_condition'].astype(float)



72


In [7]:
df['k_condition'].value_counts()

#create mask for certain values in the k_condition column
mask = (df['k_condition'] == 300.0) | (df['k_condition'] == 298.0)
df_mask = df[mask]
df_mask


Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300.0
1,CdS,19.9,300.0
2,GaN,181.0,300.0
3,ZnO,64.5,300.0
4,ZnSe,15.6,300.0
...,...,...,...
796,SiO2,11.0,298.0
797,Al2O3,38.0,298.0
798,ZrO2,1.8,298.0
799,ThO2,14.0,298.0


In [8]:
df_mask.reset_index(drop=True,inplace=True)
df_mask

Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300.0
1,CdS,19.9,300.0
2,GaN,181.0,300.0
3,ZnO,64.5,300.0
4,ZnSe,15.6,300.0
...,...,...,...
290,SiO2,11.0,298.0
291,Al2O3,38.0,298.0
292,ZrO2,1.8,298.0
293,ThO2,14.0,298.0


In [9]:
df_mask.drop(columns=['k_condition'],inplace=True)
df_mask

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mask.drop(columns=['k_condition'],inplace=True)


Unnamed: 0,formula,k_expt
0,BeS,157.0
1,CdS,19.9
2,GaN,181.0
3,ZnO,64.5
4,ZnSe,15.6
...,...,...
290,SiO2,11.0
291,Al2O3,38.0
292,ZrO2,1.8
293,ThO2,14.0


In [11]:
for vals in df_mask['formula'].value_counts().index:
    if df_mask['formula'].value_counts()[vals] > 1:
        mask_df = df_mask.loc[df_mask['formula'] == vals]
        mean_val = mask_df['k_expt'].mean()
        
        print(vals,mean_val)
        #replace the k_expt values with the mean value
        df_mask.loc[df_mask['formula'] == vals, 'k_expt'] = mean_val
        #drop the duplicate rows
        
        
    

TiO2 0.7622727272727272
Ba8Ga16Ge30 1.79
Zn4Sb3 0.8307499999999999
SiC 284.6666666666667
SiO2 6.366666666666667
Bi2Te3 2.4949707406666666
AlN 168.66666666666666
Sb2Te3 3.1110078933333334
Sr0.61Ba0.39Nb2O6 1.667
CaMnO3 7.5938799999999995
BeO 398.0
ZnO 59.23333333333333
NaCo2O4 10.491999999999999
Tl2SnTe5 5.92
CeFe3CoSb12 1.6400000000000001
TiNiSn 6.89805
Zr0.5Hf0.5NiSn 4.0179779035
Yb14MnSb11 0.97
Mg2Si 5.875
CeFe4Sb12 8.275
CdS 19.9
Ca3Co4O9 2.755
AgI 2.44
CuI 7.1
GaN 181.0
CuBr 2.75
SrTi0.8Nb0.2O3 9.19
In0.2Co4Sb12 2.5231199999999996
CuCl 1.26


In [12]:
df_mask.drop_duplicates(subset=['formula'],inplace=True)
df_mask

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mask.drop_duplicates(subset=['formula'],inplace=True)


Unnamed: 0,formula,k_expt
0,BeS,157.000000
1,CdS,19.900000
2,GaN,181.000000
3,ZnO,59.233333
4,ZnSe,15.600000
...,...,...
287,Si,150.000000
291,Al2O3,38.000000
292,ZrO2,1.800000
293,ThO2,14.000000


In [13]:
df_mask['target'] = df_mask['k_expt']   
df_mask

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mask['target'] = df_mask['k_expt']


Unnamed: 0,formula,k_expt,target
0,BeS,157.000000,157.000000
1,CdS,19.900000,19.900000
2,GaN,181.000000,181.000000
3,ZnO,59.233333,59.233333
4,ZnSe,15.600000,15.600000
...,...,...,...
287,Si,150.000000,150.000000
291,Al2O3,38.000000,38.000000
292,ZrO2,1.800000,1.800000
293,ThO2,14.000000,14.000000


In [14]:
df_mask.drop(columns=['k_expt'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mask.drop(columns=['k_expt'],inplace=True)


### dataset cleaned

In [15]:
from CBFV import composition
X, y, formulae, skipped = composition.generate_features(df_mask)
X 

python(56233) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Processing Input Data: 100%|██████████| 233/233 [00:00<00:00, 36029.82it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 233/233 [00:00<00:00, 16354.66it/s]

	Creating Pandas Objects...





Unnamed: 0,avg_Atomic_Number,avg_Atomic_Weight,avg_Period,avg_group,avg_families,avg_Metal,avg_Nonmetal,avg_Metalliod,avg_Mendeleev_Number,avg_l_quantum_number,...,mode_polarizability(A^3),mode_Melting_point_(K),mode_Boiling_Point_(K),mode_Density_(g/mL),mode_specific_heat_(J/g_K)_,mode_heat_of_fusion_(kJ/mol)_,mode_heat_of_vaporization_(kJ/mol)_,mode_thermal_conductivity_(W/(m_K))_,mode_heat_atomization(kJ/mol),mode_Cohesive_energy
0,10.000000,20.539090,2.500000,9.000000,4.500000,0.500000,0.500000,0.0,77.500000,0.500000,...,2.900,385.95,717.85,1.85000,0.71,1.71750,9.8000,0.26900,279.0,2.85
1,32.000000,72.238500,4.000000,14.000000,5.500000,0.500000,0.500000,0.0,79.000000,0.500000,...,2.900,385.95,717.85,2.07000,0.23,1.71750,9.8000,0.26900,112.0,1.16
2,19.000000,41.864870,3.000000,14.000000,6.000000,0.500000,0.500000,0.0,78.000000,1.000000,...,1.100,63.25,77.35,0.00125,0.37,0.36040,2.7928,0.02598,286.0,2.81
3,19.000000,40.694700,3.000000,14.000000,5.500000,0.500000,0.500000,0.0,78.000000,1.500000,...,0.793,54.75,90.15,0.00143,0.39,0.22259,3.4099,0.02674,131.0,1.35
4,32.000000,72.175000,4.000000,14.000000,5.500000,0.500000,0.500000,0.0,79.000000,1.500000,...,3.800,490.15,958.15,4.79000,0.32,6.69400,37.7000,0.52000,131.0,1.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,14.000000,28.085500,3.000000,14.000000,6.000000,0.000000,1.000000,0.0,78.000000,1.000000,...,5.400,1683.15,2628.15,2.33000,0.71,50.55000,384.2200,148.00000,452.0,4.63
229,10.000000,20.392256,2.400000,14.800000,6.200000,0.400000,0.600000,0.0,81.400000,1.000000,...,0.793,54.75,90.15,0.00143,0.92,0.22259,3.4099,0.02674,249.0,2.62
230,18.666667,41.074267,3.000000,12.000000,6.000000,0.333333,0.666667,0.0,72.666667,1.333333,...,0.793,54.75,90.15,0.00143,0.92,0.22259,3.4099,0.02674,249.0,2.62
231,35.333333,88.012300,3.666667,11.666667,5.666667,0.333333,0.666667,0.0,63.333333,1.333333,...,0.793,54.75,90.15,0.00143,0.92,0.22259,3.4099,0.02674,249.0,2.62


In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from botorch.models.gp_regression import SingleTaskGP
from botorch.models.model import Model
from tqdm import tqdm
from torch import Tensor
from botorch.acquisition.active_learning import (
    MCSampler,
    qNegIntegratedPosteriorVariance,
)

from botorch.fit import fit_gpytorch_mll
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer,StandardScaler
from botorch.models.gp_regression import SingleTaskGP

from sklearn.metrics import mean_absolute_error
import torch
from gpytorch.mlls import ExactMarginalLogLikelihood

# from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP
# from botorch.fit import fit_fully_bayesian_model_nuts
# from gpytorch.likelihoods.likelihood import Likelihood
# from gpytorch.means.constant_mean import ConstantMean
# from gpytorch.means.mean import Mean
# from gpytorch.models.exact_gp import ExactGP
from botorch.models import SingleTaskGP
import os 

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import warnings


# warnings.filterwarnings("ignore", category=botorch.exceptions.BotorchWarning)

from botorch.exceptions.warnings import BotorchTensorDimensionWarning, InputDataWarning
warnings.filterwarnings(
            "ignore",
            message="Input data is not standardized.",
            category=InputDataWarning,
        )

In [61]:
df_mask.reset_index(drop=True,inplace=True)

In [107]:
# y_df = pd.DataFrame({'k_expt':y.index, 'list':y.values})
y_df = y.to_frame(name='k_expt')
y_df


Unnamed: 0,k_expt
0,157.000000
1,19.900000
2,181.000000
3,59.233333
4,15.600000
...,...
228,150.000000
229,38.000000
230,1.800000
231,14.000000


### qNIPV

In [None]:

def qnipv_runs() -> list:

    rand_selection_mae = []
    xmax_candidates = []
    pred_mae = []
    pred_y = []
    pred_std = []
    qnipv_runs =[]

    def find_max_normalized_acqval(tensor_list, qNIVP):
        max_value = None
        max_index = -1
        acq_val_lst = []
        # torch.manual_seed(13)
        for i, tensor_ in enumerate(tensor_list):
            tensor = tensor_.unsqueeze(0)
            qNIVP_val = qNIVP(tensor)
            acq_val_lst.append(qNIVP_val.item())  # Assuming it's a scalar tensor

            # Check if this is the maximum value so far
            if max_value is None or qNIVP_val > max_value:
                max_value = qNIVP_val
                max_index = i

        return max_value, max_index, acq_val_lst

    for i in tqdm(seeds):
        xcandidates = xcandidates_original.clone()
        ycandidates = ycandidates_original.clone()
        xinit, yinit, xcandidates, ycandidates = random_initial_data(xcandidates, ycandidates, 0.05, seed=i)
        gp = SingleTaskGP(xinit, yinit)
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_mll(mll)
        posterior = gp(xtest)
        ypred = posterior.mean.detach().numpy()
        ystd = posterior.stddev.detach().numpy()
        
        
        # pred_y.append(ypred_mean)
        ymae = mean_absolute_error(ytest, ypred)
        
        pred_mae = []
        pred_y.append(ypred)
        pred_std.append(ystd)
        pred_mae.append(ymae)

        for inner_i in tqdm(range(len(xcandidates))):
            if not len(xcandidates):
                break
            
            qNIVP = qNegIntegratedPosteriorVariance(gp, mc_points= mcp)
            
            
            max_value, max_index, acq_val_lst = find_max_normalized_acqval(xcandidates, qNIVP)
            xmax_candidates.append(max_index)
            # print("len of tensor:",len(xcandidates[max_index]))
            # print("tensor shape:",xcandidates[max_index].shape)
            # print("candidate tensor",xcandidates[max_index])
            # print("shape of xinit:",xinit.shape)
            # add the new point to the training set
            
            
            xinit= torch.cat((xinit, xcandidates[max_index].unsqueeze(0)), 0)
            yinit = torch.cat((yinit, ycandidates[max_index].unsqueeze(0)), 0)
            
            # print('len of new train:', len(xinit))
                
            xcandidates = torch.cat((xcandidates[:max_index], xcandidates[max_index + 1:]))
            ycandidates = torch.cat((ycandidates[:max_index], ycandidates[max_index + 1:]))
            
            
            gp = SingleTaskGP(xinit, yinit) 
            # gp = SingleTaskGP(xinit, ytrain_,covar_module=rbf_kernel)
            mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
            fit_gpytorch_mll(mll)
            #predict the y values for the test set
            ypred = gp(xtest)
            ypred_mean = ypred.mean.detach().numpy()
            pred_y.append(ypred_mean)

            #calculate the mean absolute error and the standard deviation for the test set
            ymae = mean_absolute_error(ytest, ypred_mean)
            # print('mean absolute error: ', ymae)
            pred_mae.append(ymae)
            ystd = gp(xtest).stddev
            ystd = ystd.detach().numpy()
            pred_std.append(ystd)
        qnipv_runs.append(pred_mae)
    return qnipv_runs, gp