# Introduction

This notebook provides a preliminary analysis of the extrapolation power of the trained MIT classifier.

# Import packages and functions

In [1]:
import sys
# force the notebook to look for files in the upper level directory
sys.path.insert(1, '../')

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
from glob import glob
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import IFrame
from sklearn.impute import KNNImputer
from model.model_building import load_data
from data.data_cleaning import abbreviate_features
from data.compound_featurizer import read_new_struct, composition_featurizer, structure_featurizer, handbuilt_featurizer

# Set up constants

In [3]:
REDUCED_PATH = "../data/processed/IMT_Classification_Dataset_Reduced_Feature_Set_v10.xlsx"
FULL_PATH = "../data/processed/IMT_Classification_Dataset_Full_Feature_Set_v10.xlsx"

# Define some helper functions

In [4]:
def assign_oxi_state(elem_symbol):
    """Allow the user to assign oxidation state to each element."""
    oxi_state = input("{}:".format(elem_symbol))
    return float(oxi_state)


def check_all_zero_oxi_state(structure):
    """Check if all the oxidation states in the structure are all zero"""
    try:
        # get all the oxidation states by specie
        oxi_states = [specie.oxi_state for specie in structure.composition.elements]
    except AttributeError:
        # if there are no species but elements present, then return True
        return True
    
    # if the species all have zero oxidation state, also return True
    if np.sum(np.array(oxi_states) == 0) == len(oxi_states):
        return True
    return False


def check_oxi_state(structure):
    """Check if the structure has no oxidation states assigned and the guessed oxidation states are all zero. If so, trigger user input."""
    if (check_all_zero_oxi_state(structure)) and (not structure.composition.oxi_state_guesses()):
        # get all the elements in the input structure
        elem_lst = [element.symbol for element in structure.composition.element_composition.elements]
        # get the reduced formula
        reduced_formula = structure.composition.reduced_formula
        print("Unable to guess oxidation states for {}. Please manually assign oxidation states by element".format(reduced_formula))
        # get a dictionary to overwrite the default guessed oxidation states
        elem_oxi_states = {elem_symbol: [assign_oxi_state(elem_symbol)] for elem_symbol in elem_lst}
        return elem_oxi_states
    return None


def featurizer_wrapper(df_input):
    """A wrapper function around the composition, structure and handbuilt featurizers."""
    # get the structure from the initialized dataframe
    new_struct = df_input.at[0, "structure"]
    # check if the guessed oxidation states are all zeros and allow user-overwrite if true
    oxi_states_by_element = check_oxi_state(new_struct.get_primitive_structure())
    # featurize the given structure using 3 predefined featurizers
    df_output = composition_featurizer(df_input, oxi_states_override=oxi_states_by_element)
    df_output = structure_featurizer(df_output, oxi_states_override=oxi_states_by_element)
    df_output = handbuilt_featurizer(df_output)
    return df_output


def process_new_struct_df(df_new, df_full_set, columns_dropped="Compound"):
    """Process the newly featurized structure(s) and impute any missing values with KNNImputer"""
    new_struct_df_with_name = abbreviate_features(df_new)
    # check if the dataframe contains missing values: if not, then return immediately
    if new_struct_df_with_name.isna().sum(axis=1).sum() == 0:
        return new_struct_df_with_name.drop(columns=columns_dropped), new_struct_df_with_name
    # select the same features as the full feature set
    new_struct_df = new_struct_df_with_name.filter(items=df_full_set.columns).drop(columns=columns_dropped)
    # combine the full feature set with the new structure's features
    df_with_new_struct = pd.concat([df_full_set.drop(columns=["Compound", "Label", "struct_file_path"]), 
                                    new_struct_df], ignore_index=True)
    
    # impute the missing values with the values from the 5 nearest neighbors 
    # weighted by their distances to the new structures' non-missing values
    knn_imputer = KNNImputer(n_neighbors=5, weights="distance")
    # get the imputed dataframe for the new structure
    new_struct_df_imputed = knn_imputer.fit_transform(df_with_new_struct)[-df_new.shape[0]:]
    # add back the column names
    new_struct_df = pd.DataFrame(new_struct_df_imputed, columns=new_struct_df.columns)
    # get the new structure name and create a copy of new_struct_df with the compound name
    new_struct_name = new_struct_df_with_name["Compound"].to_list()
    # get the CIF file paths
    new_struct_path = new_struct_df_with_name["struct_file_path"].to_list()
    new_struct_df_with_name = new_struct_df.copy()
    new_struct_df_with_name["Compound"] = new_struct_name
    new_struct_df_with_name["struct_file_path"] = new_struct_path
    return new_struct_df, new_struct_df_with_name

# Read in the datasets

In [5]:
df = pd.read_excel(REDUCED_PATH)
df_full = pd.read_excel(FULL_PATH)

# Load the three trained models

In [6]:
# load the metal vs. non_metal classifier
metal_model = xgb.XGBClassifier()
# metal_model.load_model("./screening_assist_data/metal_reduced.model")
metal_model.load_model("../model/saved_models/new_models/metal_reduced.model")

# load the insulator vs. non_insulator classifier
insulator_model = xgb.XGBClassifier()
# insulator_model.load_model("./screening_assist_data/insulator_reduced.model")
insulator_model.load_model("../model/saved_models/new_models/insulator_reduced.model")

# load the mit vs. non_mit classifier
mit_model = xgb.XGBClassifier()
# mit_model.load_model("./screening_assist_data/mit_reduced.model")
mit_model.load_model("../model/saved_models/new_models/mit_reduced.model")

# Batch processing

The CIFs used here are taken from the work by [Wang et al](https://aip.scitation.org/doi/10.1063/5.0018811).

In [7]:
batch_folder_path = "../notebooks/bayesian_optimization_cifs/"

Then, get all the CIF paths and read in the structures as a dataframe.

**Note**: By default, any structure is read in as a supercell (a'=2a, b'=2b, c'=2c), which might lead to prolonged featurization time. If you wish not to read in the structures as supercells, please specify the `supercell_matrix` argument as None. 

In [8]:
# initialize an empty list of dataframes
df_lst = []
# get the file paths of all the cif files
cif_file_paths = glob(batch_folder_path + "*.cif")
# iterate over all files and read in the structure
for struct_file_path in cif_file_paths:
    # add the newly read in dataframe to the list
    df_lst.append(read_new_struct(struct_file_path, supercell_matrix=None))
# concatenate all the dataframes in the list
df_batch = pd.concat(df_lst, ignore_index=True)
# insert the struct_file_path column
df_batch.insert(1, "struct_file_path", cif_file_paths)

In [9]:
df_batch

Unnamed: 0,Compound,struct_file_path,structure
0,V3CrInS8,../notebooks/bayesian_optimization_cifs/InCrV3...,"[[-1.55216883 5.01058274 3.534217 ] V, [6.6..."
1,V4GaSe8,../notebooks/bayesian_optimization_cifs/GaV4Se...,"[[0. 0. 6.96663226] V, [8.8817..."
2,AlV3CrSe8,../notebooks/bayesian_optimization_cifs/AlCrV3...,"[[0. 0. 0.01806881] Al, [1.332..."
3,InMo3WSe8,../notebooks/bayesian_optimization_cifs/InWMo3...,"[[0. 0. 0.00303226] In, [8.881..."
4,In(MoSe2)4,../notebooks/bayesian_optimization_cifs/InMo4S...,"[[ 0. 0. 17.88459058] In, [8...."
5,Al(VSe2)4,../notebooks/bayesian_optimization_cifs/AlV4Se...,"[[ 0. 0. 18.11829922] Al, [8...."
6,TaInMo3Se8,../notebooks/bayesian_optimization_cifs/InTaMo...,"[[0. 0. 7.31360225] Ta, [8.881..."
7,AlV3CrS8,../notebooks/bayesian_optimization_cifs/AlCrV3...,"[[0. 0. 0.00188429] Al, [8.881..."
8,V3InWS8,../notebooks/bayesian_optimization_cifs/InWV3S...,"[[-1.52445152 4.98298837 3.4054685 ] V, [6.6..."
9,NbInMo3Se8,../notebooks/bayesian_optimization_cifs/InNbMo...,"[[0. 0. 7.32585697] Nb, [8.881..."


Check the oxidation states of the structures read in.

In [10]:
with tqdm(df_batch.index) as t:
    for row_index in t:
        # print out a progress bar
        t.set_description("Checking %s" % df_batch.at[row_index, "Compound"])
        # access the structure and create a copy
        struct_to_check = df_batch.at[row_index, "structure"].copy()
        # check the oxidation states and ask for input if there is a need to add the oxidation state by hand
        oxi_states_by_element = check_oxi_state(struct_to_check.get_primitive_structure())
        # if there is a need to overwrite the original structure
        if oxi_states_by_element:
            # extract the number from the list of oxidation states for each element
            oxi_states_by_element = {element: oxi_state_lst[0] for element, oxi_state_lst in oxi_states_by_element.items()}
            # add the oxidation states by hand
            struct_to_check.add_oxidation_state_by_element(oxidation_states=oxi_states_by_element)
            # overwrite the original structure in the dataframe
            df_batch.at[row_index, "structure"] = struct_to_check
            # overwrite the original CIF in the screening folder
            CifWriter(struct_to_check).write_file(df_batch.at[row_index, "struct_file_path"])

Checking TaAlV3Se8: 100%|██████████| 12/12 [00:00<00:00, 28.62it/s]


Next, we can featurize the new structures in the batch. 

In [11]:
df_batch_output = composition_featurizer(df_batch)
df_batch_output = structure_featurizer(df_batch_output)
df_batch_output = handbuilt_featurizer(df_batch_output)

HBox(children=(FloatProgress(value=0.0, description='StrToComposition', max=12.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='ElementProperty', max=12.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='CompositionToOxidComposition', max=12.0, style=ProgressSt…




HBox(children=(FloatProgress(value=0.0, description='OxidationStates', max=12.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='StructureToOxidStructure', max=12.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='EwaldEnergy', max=12.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='StructuralHeterogeneity', max=12.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='GlobalInstabilityIndex', max=12.0, style=ProgressStyle(de…




  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, description='Handbuilt Featurizer', max=12.0, style=ProgressStyle(desc…




In [12]:
df_batch_output

Unnamed: 0,Compound,struct_file_path,structure,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,max_xx_dists,min_xx_dists,avg_xx_dists,v_m,v_x,iv,iv_p1,est_hubbard_u,est_charge_trans,volume_per_site
0,V3CrInS8,../notebooks/bayesian_optimization_cifs/InCrV3...,"[[-1.55216883 5.01058274 3.534217 ] V, [6.6...","(V, Cr, In, S)",16.0,49.0,33.0,20.769231,5.869822,16.0,...,4.054171,3.094077,3.530984,-26.735333,21.206167,30.959,49.16,16.186355,15.800026,19.773137
1,V4GaSe8,../notebooks/bayesian_optimization_cifs/GaV4Se...,"[[0. 0. 6.96663226] V, [8.8817...","(V, Ga, Se)",23.0,34.0,11.0,30.384615,4.544379,34.0,...,4.069288,3.282759,3.621671,-24.491468,19.62367,33.660575,51.352163,12.994912,,21.106495
2,AlV3CrSe8,../notebooks/bayesian_optimization_cifs/AlCrV3...,"[[0. 0. 0.01806881] Al, [1.332...","(Al, V, Cr, Se)",13.0,34.0,21.0,29.076923,6.059172,34.0,...,4.010538,3.361426,3.665041,-24.739439,19.86525,30.959,49.16,16.247428,,21.732649
3,InMo3WSe8,../notebooks/bayesian_optimization_cifs/InWMo3...,"[[0. 0. 0.00303226] In, [8.881...","(In, Mo, W, Se)",34.0,74.0,40.0,40.076923,7.47929,34.0,...,4.156816,3.183916,3.631877,-13.494845,21.470657,16.37,26.0,7.668372,,22.024222
4,In(MoSe2)4,../notebooks/bayesian_optimization_cifs/InMo4S...,"[[ 0. 0. 17.88459058] In, [8....","(In, Mo, Se)",34.0,49.0,15.0,37.615385,4.449704,34.0,...,4.161451,3.190922,3.634712,-21.978981,19.855984,32.63,46.199583,8.667154,,22.038915
5,Al(VSe2)4,../notebooks/bayesian_optimization_cifs/AlV4Se...,"[[ 0. 0. 18.11829922] Al, [8....","(Al, V, Se)",13.0,34.0,21.0,29.0,6.153846,34.0,...,4.026647,3.318369,3.6207,-24.403355,19.612962,33.660575,51.352163,12.992182,,21.044866
6,TaInMo3Se8,../notebooks/bayesian_optimization_cifs/InTaMo...,"[[0. 0. 7.31360225] Ta, [8.881...","(Ta, In, Mo, Se)",34.0,73.0,39.0,40.0,7.384615,34.0,...,4.070041,3.213229,3.639234,-20.108997,19.877928,25.911111,38.863333,7.94703,,22.501317
7,AlV3CrS8,../notebooks/bayesian_optimization_cifs/AlCrV3...,"[[0. 0. 0.00188429] Al, [8.881...","(Al, V, Cr, S)",13.0,24.0,11.0,18.0,3.230769,16.0,...,3.764011,3.230442,3.477396,-26.333155,21.144436,30.959,49.16,16.14211,15.312313,18.612907
8,V3InWS8,../notebooks/bayesian_optimization_cifs/InWV3S...,"[[-1.52445152 4.98298837 3.4054685 ] V, [6.6...","(V, In, W, S)",16.0,74.0,58.0,24.615385,11.349112,16.0,...,3.925584,3.051603,3.465166,-16.878907,22.446379,16.37,26.0,7.567377,21.894646,19.002175
9,NbInMo3Se8,../notebooks/bayesian_optimization_cifs/InNbMo...,"[[0. 0. 7.32585697] Nb, [8.881...","(Nb, In, Mo, Se)",34.0,49.0,15.0,37.538462,4.35503,34.0,...,4.054711,3.216957,3.637665,-20.127282,19.907494,25.911111,38.863333,7.948624,,22.491839


Just like before, we also need to process the newly featurized structures by imputing the missing values with KNNImputer if there is any, as well as selecting the features in the reduced feature set.

In [13]:
new_batch_df, new_batch_df_with_name = process_new_struct_df(df_batch_output, df_full, columns_dropped=["Compound", "struct_file_path"])
new_batch_df = new_batch_df.filter(items=df.columns)
new_batch_df

Unnamed: 0,gii,est_hubbard_u,est_charge_trans,ewald_energy_per_atom,avg_dev_Electronegativity,range_MendeleevNumber,avg_dev_CovalentRadius,avg_mm_dists,avg_mx_dists,avg_xx_dists
0,0.312014,16.186355,15.800026,-29.513354,0.432663,42.0,20.35503,7.147487,2.437125,3.530984
1,0.353128,12.994912,15.794748,-27.221925,0.418462,43.0,13.964497,3.065924,2.578254,3.621671
2,0.347557,16.247428,14.563869,-27.951787,0.434556,43.0,12.52071,7.370935,2.566246,3.665041
3,0.304919,7.668372,15.289325,-26.119142,0.201657,39.0,15.715976,7.340661,2.652329,3.631877
4,0.974273,8.667154,11.711629,-25.40853,0.220592,39.0,14.95858,2.937248,2.638365,3.634712
5,0.347661,12.992182,14.565743,-27.255018,0.437396,43.0,14.011834,3.064143,2.581328,3.6207
6,0.449155,7.94703,0.133469,-26.344815,0.283077,41.0,16.473373,2.876942,2.65453,3.639234
7,0.325402,16.14211,15.312313,-29.829169,0.448757,42.0,18.366864,6.993891,2.427346,3.477396
8,1.046445,7.567377,21.894646,-29.12174,0.378343,42.0,22.532544,6.981233,2.488437,3.465166
9,0.996577,7.948624,11.853198,-26.293115,0.273609,42.0,15.905325,2.877859,2.650908,3.637665


We are ready to make the classification for all the structures.

In [14]:
# get the number of compounds in the batch
num_compounds = df_batch.shape[0]
# initialize an empty list to store all the classification result
classification_lst = []
# iterate through all the models
for model in [metal_model, insulator_model, mit_model]:
    # get the binary classification as 0 or 1
    classification = np.reshape(model.predict(new_batch_df), (num_compounds, 1))
    # get the classification probability for the positive class
    classification_proba = np.round(np.reshape(model.predict_proba(new_batch_df)[:, 1], (num_compounds, 1)), 4)
    # for each model, concatenate the binary classification and classification probability
    classification_lst.append(np.concatenate((classification, classification_proba), axis=1))

In [15]:
# create a dataframe to store the classification result
classification_result_df = pd.DataFrame(np.concatenate(classification_lst, axis=1), columns=["is_metal", "is_metal_proba",
                                                                                             "is_insulator", "is_insulator_proba", 
                                                                                             "is_mit", "is_mit_proba"])
# add back the compound formula
classification_result_df = pd.concat([new_batch_df_with_name[["Compound", "struct_file_path"]], classification_result_df], axis=1)

In [16]:
classification_result_df = classification_result_df.sort_values(by="Compound", ignore_index=True)
classification_result_df

Unnamed: 0,Compound,struct_file_path,is_metal,is_metal_proba,is_insulator,is_insulator_proba,is_mit,is_mit_proba
0,Al(VSe2)4,../notebooks/bayesian_optimization_cifs/AlV4Se...,1.0,0.8983,0.0,0.107,1.0,0.9005
1,AlV3CrS8,../notebooks/bayesian_optimization_cifs/AlCrV3...,1.0,0.9079,0.0,0.1856,1.0,0.529
2,AlV3CrSe8,../notebooks/bayesian_optimization_cifs/AlCrV3...,1.0,0.9079,0.0,0.1588,0.0,0.4666
3,In(MoSe2)4,../notebooks/bayesian_optimization_cifs/InMo4S...,0.0,0.4761,0.0,0.0645,1.0,0.594
4,InMo3WSe8,../notebooks/bayesian_optimization_cifs/InWMo3...,0.0,0.4761,0.0,0.0942,0.0,0.3107
5,NbInMo3Se8,../notebooks/bayesian_optimization_cifs/InNbMo...,1.0,0.8983,0.0,0.0623,1.0,0.6661
6,TaAlV3Se8,../notebooks/bayesian_optimization_cifs/AlTaV3...,1.0,0.8983,0.0,0.0601,1.0,0.7728
7,TaInMo3Se8,../notebooks/bayesian_optimization_cifs/InTaMo...,1.0,0.8983,0.0,0.0547,1.0,0.6944
8,V3CrInS8,../notebooks/bayesian_optimization_cifs/InCrV3...,1.0,0.9079,0.0,0.1741,1.0,0.6272
9,V3CrInSe8,../notebooks/bayesian_optimization_cifs/InCrV3...,1.0,0.9079,0.0,0.1735,1.0,0.529


In [17]:
def highlight_one(s):
    """Define a function to highlight 1 with yellow in a pandas series"""
    is_one = s == 1
    return ['background-color: yellow' if v else '' for v in is_one]


def highlight_training_data(s):
    """Define a function to highlight 1 with red in a pandas series"""
    is_one = s == 1
    return ['background-color: red' if v else '' for v in is_one]


def retrieve_classification(row):
    if row["in_training_set"] == 1:
        compound_name = row["Compound"]
        training_label = df_full[df_full.Compound == compound_name].reset_index().at[0, "Label"]
        if training_label == 0:
            return "metal"
        elif training_label == 1:
            return "insulator"
        else:
            return "mit"
    return "N/A"

In [18]:
# get a list of all the compounds in training data
training_compounds = df_full["Compound"].to_list()
# create a new column where if the compound is in training set, it will have a value of 1 and 0 otherwise
classification_result_df["in_training_set"] = classification_result_df["Compound"].apply(lambda compound: 1 if compound in training_compounds else 0)

In [19]:
classification_result_df["training_set_label"] = classification_result_df.apply(retrieve_classification, axis=1)

Print the classification result table with each row showing the result for one compound. If a compound is classified as any of the three classes, the class classified will be highlighted with yellow.

In [20]:
classification_result_df.style.format("{:n}", subset=["is_metal", "is_insulator", "is_mit"])\
                              .apply(highlight_one, subset=["is_metal", "is_insulator", "is_mit"])\
                              .apply(highlight_training_data, subset=["in_training_set"])\
                              .format("{:.4f}", subset=["is_metal_proba", "is_insulator_proba", "is_mit_proba"])

Unnamed: 0,Compound,struct_file_path,is_metal,is_metal_proba,is_insulator,is_insulator_proba,is_mit,is_mit_proba,in_training_set,training_set_label
0,Al(VSe2)4,../notebooks/bayesian_optimization_cifs/AlV4Se8.cif,1,0.8983,0,0.107,1,0.9005,0,
1,AlV3CrS8,../notebooks/bayesian_optimization_cifs/AlCrV3S8.cif,1,0.9079,0,0.1856,1,0.529,0,
2,AlV3CrSe8,../notebooks/bayesian_optimization_cifs/AlCrV3Se8.cif,1,0.9079,0,0.1588,0,0.4666,0,
3,In(MoSe2)4,../notebooks/bayesian_optimization_cifs/InMo4Se8.cif,0,0.4761,0,0.0645,1,0.594,0,
4,InMo3WSe8,../notebooks/bayesian_optimization_cifs/InWMo3Se8.cif,0,0.4761,0,0.0942,0,0.3107,0,
5,NbInMo3Se8,../notebooks/bayesian_optimization_cifs/InNbMo3Se8.cif,1,0.8983,0,0.0623,1,0.6661,0,
6,TaAlV3Se8,../notebooks/bayesian_optimization_cifs/AlTaV3Se8.cif,1,0.8983,0,0.0601,1,0.7728,0,
7,TaInMo3Se8,../notebooks/bayesian_optimization_cifs/InTaMo3Se8.cif,1,0.8983,0,0.0547,1,0.6944,0,
8,V3CrInS8,../notebooks/bayesian_optimization_cifs/InCrV3S8.cif,1,0.9079,0,0.1741,1,0.6272,0,
9,V3CrInSe8,../notebooks/bayesian_optimization_cifs/InCrV3Se8.cif,1,0.9079,0,0.1735,1,0.529,0,


In [21]:
binary_convert_dict = {1: "Yes", 0: "No"}
classification_result_df.drop(columns=["struct_file_path", "in_training_set", "training_set_label"]).replace({"is_metal": binary_convert_dict, 
                                                                                                              "is_insulator": binary_convert_dict, 
                                                                                                              "is_mit": binary_convert_dict}).to_csv("./raymond_work_comparison.csv", 
                                                                                                                                                     index=False)

In [22]:
classification_result_df_latex = classification_result_df[["Compound", "is_mit", "is_mit_proba"]].replace({"is_mit": binary_convert_dict})
classification_result_df_latex

Unnamed: 0,Compound,is_mit,is_mit_proba
0,Al(VSe2)4,Yes,0.9005
1,AlV3CrS8,Yes,0.529
2,AlV3CrSe8,No,0.4666
3,In(MoSe2)4,Yes,0.594
4,InMo3WSe8,No,0.3107
5,NbInMo3Se8,Yes,0.6661
6,TaAlV3Se8,Yes,0.7728
7,TaInMo3Se8,Yes,0.6944
8,V3CrInS8,Yes,0.6272
9,V3CrInSe8,Yes,0.529


In [23]:
print(classification_result_df_latex.to_latex(index=False, label="tab:wang_work_comparison", 
                                              caption='Comparison between the MIT predictions made by the Bayesian optimization and those made by the MIT classifier.\
The "Compound" column lists the 12 MIT compounds from the Bayesian optimization; the "is\_mit" column lists the MIT classification from the ML classifier;\
The "is\_mit\_proba" lists the probability of a compound exhibiting MIT as predicted by the ML classifier.'))

\begin{table}
\centering
\caption{Comparison between the MIT predictions made by the Bayesian optimization and those made by the MIT classifier.The "Compound" column lists the 12 MIT compounds from the Bayesian optimization; the "is\_mit" column lists the MIT classification from the ML classifier;The "is\_mit\_proba" lists the probability of a compound exhibiting MIT as predicted by the ML classifier.}
\label{tab:wang_work_comparison}
\begin{tabular}{llr}
\toprule
   Compound & is\_mit &  is\_mit\_proba \\
\midrule
  Al(VSe2)4 &    Yes &        0.9005 \\
   AlV3CrS8 &    Yes &        0.5290 \\
  AlV3CrSe8 &     No &        0.4666 \\
 In(MoSe2)4 &    Yes &        0.5940 \\
  InMo3WSe8 &     No &        0.3107 \\
 NbInMo3Se8 &    Yes &        0.6661 \\
  TaAlV3Se8 &    Yes &        0.7728 \\
 TaInMo3Se8 &    Yes &        0.6944 \\
   V3CrInS8 &    Yes &        0.6272 \\
  V3CrInSe8 &    Yes &        0.5290 \\
    V3InWS8 &     No &        0.2489 \\
    V4GaSe8 &    Yes &        0.8886 \\