# Introduction

This notebook provides a preliminary analysis of the robustness of the trained MIT classifier.

# Import packages and functions

In [1]:
import sys
# Put the parent directory on the module search path (at position 1) so that the
# project-local `model` and `data` packages imported in the next cell can be found.
# '../' is a relative path, so it is resolved against the current working directory.
sys.path.insert(1, '../')

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
from glob import glob
from tqdm import tqdm
import pymatgen as mg
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import IFrame
from sklearn.impute import KNNImputer
from model.model_building import load_data
from data.data_cleaning import abbreviate_features
from data.compound_featurizer import read_new_struct, composition_featurizer, structure_featurizer, handbuilt_featurizer

# Set up constants

In [3]:
# seed shared by the stochastic steps below (compound sampling and noise generation)
RANDOM_SEED = 31_415_926
# Excel workbook holding the reduced feature set, relative to this notebook's directory
REDUCED_PATH = "../data/processed/IMT_Classification_Dataset_Reduced_Feature_Set_v10.xlsx"

# Read in the reduced dataset

In [4]:
# read the reduced-feature dataset from the Excel workbook at REDUCED_PATH
df = pd.read_excel(REDUCED_PATH)
# display the loaded frame (the notebook renders a truncated preview of long frames)
df

Unnamed: 0,Compound,Label,struct_file_path,gii,est_hubbard_u,est_charge_trans,ewald_energy_per_atom,avg_dev_Electronegativity,range_MendeleevNumber,avg_dev_CovalentRadius,avg_mm_dists,avg_mx_dists,avg_xx_dists
0,Ba(FeSb3)4,0,../data/Structures/Metals/BaFe4Sb12_CollCode_6...,0.000000,21.129322,19.461062,-12.916613,0.169412,76,8.608997,4.601000,2.596481,3.266718
1,Ba(NiP)2,0,../data/Structures/Metals/BaNi2P2_SD_1701656.cif,1.080042,11.858755,35.445979,-29.022094,0.371200,74,31.840000,2.790950,2.259574,3.631725
2,Ba(PIr)2,0,../data/Structures/Metals/BaIr2P2_CollCode_957...,3.015802,5.839285,34.042180,-27.929343,0.417600,74,29.120000,2.790243,2.355169,3.754372
3,Ba(PRh)2,0,../data/Structures/Metals/BaRh2P2_CollCode_501...,1.337935,7.810114,33.195560,-28.030653,0.430400,74,28.960000,2.785294,2.346291,3.757078
4,Ba(Sb3Os)4,0,../data/Structures/Metals/BaOs4Sb12_CollCode_6...,1.037038,12.916563,10.425355,-12.725768,0.132595,76,8.276817,4.670000,2.635420,3.315708
...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,VO2,2,../data/Structures/MIT_materials/HighT/VO2_HT_...,0.128538,14.484546,9.779242,-47.781257,0.804444,41,38.666667,3.522330,1.927294,2.675472
339,YCoO3,2,../data/Structures/MIT_materials/HighT/YCoO3_C...,0.188213,13.895560,12.815560,-34.679490,0.907200,75,44.160000,3.716576,1.933264,2.881036
340,YFe4(CuO4)3,2,../data/Structures/MIT_materials/HighT/YCu3Fe4...,0.188611,12.807177,16.608178,-37.225832,0.796800,75,35.160000,3.653850,2.356918,2.745188
341,YNiO3,2,../data/Structures/MIT_materials/HighT/YNiO3_6...,0.231001,15.898371,10.393278,-34.202393,0.900000,75,43.680000,3.755161,1.963601,2.827539


# Load the three trained models

In [5]:
def _load_classifier(model_path):
    """Create a fresh XGBClassifier and populate it from the saved model at `model_path`."""
    classifier = xgb.XGBClassifier()
    classifier.load_model(model_path)
    return classifier


# All three binary classifiers are read from ../model/saved_models/new_models/
# (alternative location used previously: ./screening_assist_data/<name>_reduced.model).

# metal vs. non_metal classifier
metal_model = _load_classifier("../model/saved_models/new_models/metal_reduced.model")

# insulator vs. non_insulator classifier
insulator_model = _load_classifier("../model/saved_models/new_models/insulator_reduced.model")

# mit vs. non_mit classifier
mit_model = _load_classifier("../model/saved_models/new_models/mit_reduced.model")

## Select MITs from the training set

In [6]:
# draw a reproducible random sample of 10 compounds carrying Label 2 (the MIT class)
selected_training_mits = df.loc[df["Label"] == 2].sample(n=10, random_state=RANDOM_SEED)
selected_training_mits

Unnamed: 0,Compound,Label,struct_file_path,gii,est_hubbard_u,est_charge_trans,ewald_energy_per_atom,avg_dev_Electronegativity,range_MendeleevNumber,avg_dev_CovalentRadius,avg_mm_dists,avg_mx_dists,avg_xx_dists
295,GdFe4(CuO4)3,2,../data/Structures/MIT_materials/HighT/GdCu3Fe...,0.335213,16.611023,7.531115,-35.512454,0.798,60,35.52,3.66592,2.362237,2.754201
313,NiSeS,2,../data/Structures/MIT_materials/HighT/NiSeS_C...,0.318498,13.516153,8.891048,-9.425284,0.291111,28,7.555556,4.111826,2.424039,3.060164
341,YNiO3,2,../data/Structures/MIT_materials/HighT/YNiO3_6...,0.231001,15.898371,10.393278,-34.202393,0.9,75,43.68,3.755161,1.963601,2.827539
304,LuFe4(CuO4)3,2,../data/Structures/MIT_materials/HighT/LuCu3Fe...,0.379943,16.5957,7.398845,-35.616996,0.7938,46,34.98,3.651675,2.364168,2.743522
296,GdNiO3,2,../data/Structures/MIT_materials/HighT/GdNiO3_...,0.304129,15.921411,10.41311,-34.071996,0.9048,60,45.12,3.77786,1.9564,2.83782
287,ErNiO3,2,../data/Structures/MIT_materials/HighT/ErNiO3_...,0.230297,15.904702,10.465381,-34.268391,0.8952,52,43.44,3.761371,1.961757,2.912261
333,V4O7,2,../data/Structures/MIT_materials/HighT/V4O7_HT...,0.22407,13.662776,14.397882,-42.046314,0.837686,41,40.264463,3.331325,1.958619,2.801728
308,NbO2,2,../data/Structures/MIT_materials/HighT/NbO2_HT...,0.095388,9.118929,14.785545,-44.774845,0.817778,40,43.555556,3.747107,2.041475,2.873598
285,DyNiO3,2,../data/Structures/MIT_materials/HighT/DyNiO3_...,0.219756,15.919266,10.405029,-34.153332,0.9,56,44.16,3.775735,1.96251,2.929107
309,Nd2Ir2O7,2,../data/Structures/MIT_materials/HighT/Nd2Ir2O...,0.150083,13.075056,12.70654,-39.163938,0.819174,68,48.595041,3.668753,2.013474,2.886355


In [7]:
# keep only the numeric feature columns: the compound name, class label and
# structure-file path are removed before the frame is passed to the classifiers
selected_mits_reduced = selected_training_mits.drop(
    labels=["Compound", "Label", "struct_file_path"], axis="columns"
)

In [8]:
# seeded generator, so the same noise matrix is produced on every run
rng = np.random.default_rng(seed=RANDOM_SEED)
# one draw per feature value, uniformly distributed on [-1, 1)
random_noise = rng.uniform(-1, 1, selected_mits_reduced.shape)
pd.DataFrame(random_noise)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.953383,0.658325,-0.389541,0.346852,-0.909087,-0.547178,-0.709569,0.233215,-0.385987,-0.685978
1,0.507677,0.182206,-0.334497,0.648892,-0.619752,0.614163,0.085282,0.390762,-0.260941,0.943496
2,-0.241764,0.41976,-0.305851,0.550291,0.736776,-0.087831,0.393998,0.149243,-0.31249,0.801001
3,-0.150514,-0.403763,-0.168389,0.224922,-0.136249,0.573701,-0.218003,-0.899334,-0.293578,-0.03574
4,-0.936637,0.428552,-0.357257,-0.27319,0.728458,0.578422,-0.193722,-0.919399,0.197907,0.775907
5,0.500548,-0.64864,-0.178819,-0.061552,0.870458,0.79747,-0.320426,-0.454894,-0.725567,0.486722
6,0.533837,-0.869945,-0.438947,-0.976752,-0.06857,0.221113,0.475535,0.852229,-0.029244,-0.972483
7,0.930063,0.791764,-0.405188,0.773891,-0.633824,0.180922,-0.686499,0.495001,0.457635,0.451579
8,0.758952,0.099406,-0.478958,0.412173,0.615334,0.219571,0.644303,-0.266846,0.168502,0.831372
9,0.850296,0.217524,0.759371,-0.475545,-0.128026,-0.333478,0.808744,0.835949,-0.271153,0.306922


# Evaluate the robustness by adding random noise
The noise is added by multiplying each original feature value by a coefficient. For example, adding a ±1% change means multiplying by a value drawn from the range [0.99, 1.01).

In [9]:
def eval_robustness(random_noise, df_orig_features, change_perc, pos_class="MIT",
                    models=None, classification_columns=None):
    """Perturb the features with multiplicative noise and re-classify them.

    Every feature value is multiplied by ``1 + change_perc * noise`` (element-wise),
    the perturbed frame is passed to each model, and the binary predictions of the
    model selected by `pos_class` are returned.

    Parameters
    ----------
    random_noise : array-like
        Noise matrix with the same shape as `df_orig_features`.
    df_orig_features : pandas.DataFrame
        Unperturbed feature values, one row per compound. Its index is reset
        (dropped) before the noise is applied, so rows are matched by position.
    change_perc : float
        Scale applied to the noise, e.g. 0.01 for a change of up to 1%.
    pos_class : str
        Name of the class of interest; lower-cased and prefixed with ``is_`` to
        pick the result column (default "MIT" -> "is_mit").
    models : sequence, optional
        Objects exposing ``predict`` and ``predict_proba``. When omitted, the
        notebook-level ``metal_model``, ``insulator_model`` and ``mit_model`` are
        looked up at call time, so reloading them takes effect without
        re-defining this function.
    classification_columns : sequence of str, optional
        Column names for the combined result: two per model (prediction, then
        positive-class probability), in model order. Defaults to the
        is_metal / is_insulator / is_mit columns.

    Returns
    -------
    tuple
        ``(pos_num, result_df)`` where `pos_num` is the number of rows predicted
        as the positive class and `result_df` is a one-column DataFrame of those
        predictions, labelled with the change percentage (e.g. "10.0%").

    Raises
    ------
    ValueError
        If `random_noise` and `df_orig_features` differ in shape.
    """
    # resolve the defaults at call time rather than binding them when the function is defined
    if models is None:
        models = [metal_model, insulator_model, mit_model]
    if classification_columns is None:
        classification_columns = ["is_metal", "is_metal_proba",
                                  "is_insulator", "is_insulator_proba",
                                  "is_mit", "is_mit_proba"]
    noise = np.asarray(random_noise)
    # a shape mismatch would otherwise be index-aligned into NaN values without any error
    if noise.shape != df_orig_features.shape:
        raise ValueError("random_noise has shape %s but df_orig_features has shape %s"
                         % (noise.shape, df_orig_features.shape))
    # positional index, so the features line up with the noise coefficients row by row
    features = df_orig_features.reset_index(drop=True)
    # convert the random noise into multiplying coefficients
    random_noise_coeff = pd.DataFrame(1 + change_perc * noise, columns=features.columns)
    # multiply the features with the random noise coefficients element-wise
    df_orig_features_noise = features.mul(random_noise_coeff)
    num_compounds = df_orig_features_noise.shape[0]
    # one (num_compounds, 2) block per model: binary prediction, positive-class probability
    classification_lst = []
    for model in models:
        classification = np.reshape(model.predict(df_orig_features_noise), (num_compounds, 1))
        classification_proba = np.reshape(model.predict_proba(df_orig_features_noise)[:, 1],
                                          (num_compounds, 1))
        classification_lst.append(np.concatenate((classification, classification_proba), axis=1))
    classification_result_df = pd.DataFrame(np.concatenate(classification_lst, axis=1),
                                            columns=classification_columns)
    # column holding the binary prediction for the positive class
    col_name = "is_%s" % pos_class.lower()
    pos_class_results = classification_result_df[[col_name]]
    # vectorized count of the rows predicted as the positive class
    pos_num = int((pos_class_results[col_name] == 1).sum())
    # label the result column with the change percentage
    result_df = pos_class_results.rename(columns={col_name: "%s%%" % (change_perc * 100.)})
    return (pos_num, result_df)

In [10]:
# sweep the noise amplitude and collect one single-column result frame per level
eval_results = []
for change_perc in [0, 0.01, 0.1, 1]:
    pos_count, per_level_df = eval_robustness(random_noise, selected_mits_reduced, change_perc)
    eval_results.append(per_level_df)
    print("For ±%s%% change, there are %d positive classes" % (change_perc * 100., pos_count))

For ±0.0% change, there are 10 positive classes
For ±1.0% change, there are 10 positive classes
For ±10.0% change, there are 7 positive classes
For ±100.0% change, there are 0 positive classes


In [11]:
# readable labels for the numeric class predictions
binary_convert_dict = {1: "Yes", 0: "No"}
# compound names (re-indexed by position) followed by one result column per noise level
results_df = pd.concat(
    [selected_training_mits[["Compound"]].reset_index(drop=True),
     pd.concat(eval_results, axis=1)],
    axis=1,
)
# translate every noise-level column to Yes/No; the Compound column is left untouched
results_df = results_df.replace(
    {change_col: binary_convert_dict for change_col in results_df.columns.drop("Compound")}
)
results_df

Unnamed: 0,Compound,0.0%,1.0%,10.0%,100.0%
0,GdFe4(CuO4)3,Yes,Yes,Yes,No
1,NiSeS,Yes,Yes,Yes,No
2,YNiO3,Yes,Yes,No,No
3,LuFe4(CuO4)3,Yes,Yes,Yes,No
4,GdNiO3,Yes,Yes,Yes,No
5,ErNiO3,Yes,Yes,Yes,No
6,V4O7,Yes,Yes,Yes,No
7,NbO2,Yes,Yes,No,No
8,DyNiO3,Yes,Yes,Yes,No
9,Nd2Ir2O7,Yes,Yes,No,No


In [12]:
def highlight_yes(s):
    """Return one CSS string per element of a pandas series: yellow background for 'Yes', empty otherwise"""
    yes_style, no_style = 'background-color: yellow', ''
    return [yes_style if matched else no_style for matched in s.eq("Yes")]

In [13]:
# highlight the 'Yes' cells in every noise-level column (all columns except Compound)
results_df.style.apply(highlight_yes, subset=results_df.columns.drop("Compound"))

Unnamed: 0,Compound,0.0%,1.0%,10.0%,100.0%
0,GdFe4(CuO4)3,Yes,Yes,Yes,No
1,NiSeS,Yes,Yes,Yes,No
2,YNiO3,Yes,Yes,No,No
3,LuFe4(CuO4)3,Yes,Yes,Yes,No
4,GdNiO3,Yes,Yes,Yes,No
5,ErNiO3,Yes,Yes,Yes,No
6,V4O7,Yes,Yes,Yes,No
7,NbO2,Yes,Yes,No,No
8,DyNiO3,Yes,Yes,Yes,No
9,Nd2Ir2O7,Yes,Yes,No,No


In [14]:
# The caption is assembled from adjacent raw-string literals: raw strings keep the
# LaTeX backslashes (\pm, \%) verbatim without relying on invalid escape sequences,
# and the explicit trailing spaces stop neighbouring sentences from running together.
print(results_df.to_latex(
    caption=(
        r"Classification results with different random noises. "
        r"The 4 columns after the Compound column represent changes within "
        r"$\pm0.0\%$, $\pm1.0\%$, $\pm10.0\%$, and $\pm100.0\%$ respectively. "
        r"'Yes' means the compound is still classified as an MIT, "
        r"while 'No' means it is no longer classified as an MIT."
    ),
    label="tab:robustness",
    index=False,
))

\begin{table}
\centering
\caption{Classification results with different random noises. The 4 columns after the Compound column represent changes within$\pm0.0\%$, $\pm1.0\%$, $\pm10.0\%$, and $\pm100.0\%$ respectively.'Yes' means the compound is still classified as an MIT, while 'No' means it is no longer classified as an MIT.}
\label{tab:robustness}
\begin{tabular}{lllll}
\toprule
     Compound & 0.0\% & 1.0\% & 10.0\% & 100.0\% \\
\midrule
 GdFe4(CuO4)3 &  Yes &  Yes &   Yes &     No \\
        NiSeS &  Yes &  Yes &   Yes &     No \\
        YNiO3 &  Yes &  Yes &    No &     No \\
 LuFe4(CuO4)3 &  Yes &  Yes &   Yes &     No \\
       GdNiO3 &  Yes &  Yes &   Yes &     No \\
       ErNiO3 &  Yes &  Yes &   Yes &     No \\
         V4O7 &  Yes &  Yes &   Yes &     No \\
         NbO2 &  Yes &  Yes &    No &     No \\
       DyNiO3 &  Yes &  Yes &   Yes &     No \\
     Nd2Ir2O7 &  Yes &  Yes &    No &     No \\
\bottomrule
\end{tabular}
\end{table}

