In [1]:
import numpy as np
import glob
import pandas as pd
import pymatgen as mg
import tqdm

In [2]:
def generate_labels_from_structures():
    """Creates a DataFrame with labels based on structure files."""
    counter = 0
    data = {}
    for filename in glob.glob("../Structures/Metals/*.cif"):
        struct = mg.Structure.from_file(filename)
        formula = struct.composition.reduced_formula
        filepath = filename.split('Structures/')[-1]
        data[counter] = {'Compound': formula, 'Label': 0, 'struct_file_path': filepath}
        counter+=1
    
    for filename in glob.glob("../Structures/Insulators/*.cif"):
        struct = mg.Structure.from_file(filename)
        formula = struct.composition.reduced_formula
        filepath = filename.split('Structures/')[-1]
        data[counter] = {'Compound': formula, 'Label': 1, 'struct_file_path': filepath}
        counter+=1 
    
    for filename in glob.glob("../Structures/MIT_materials/*/*.cif"):
        struct = mg.Structure.from_file(filename)
        formula = struct.composition.reduced_formula
        filepath = filename.split('Structures/')[-1]
        data[counter] = {'Compound': formula, 'Label': 2, 'struct_file_path': filepath}
        counter+=1
    
    df = pd.DataFrame.from_dict(data, orient='index')
    return df

In [3]:
df = generate_labels_from_structures()



In [4]:
# Manually correct Pymatgen naming PbTiO3 as TiPbO3
df.loc[df['Compound']=='TiPbO3', 'Compound'] = 'PbTiO3'
df.loc[df['Compound']=='PbTiO3']

Unnamed: 0,Compound,Label,struct_file_path
98,PbTiO3,1,Insulators/PbTiO3_61168.cif


In [5]:
df.to_csv("../data/IMT_Classifcation_Dataset_raw.csv", index=False)
df.head()

Unnamed: 0,Compound,Label,struct_file_path
0,W18O49,0,Metals/W18O49_15254.cif
1,CaCrO3,0,Metals/CaCrO3_245840.cif
2,TiO,0,Metals/TiO_56612.cif
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif


# Composition based features

In [6]:
# Convert formula strings to pymatgen Composition objects
from matminer.featurizers.conversions import StrToComposition
df = StrToComposition().featurize_dataframe(df, 'Compound')
df.head()

HBox(children=(IntProgress(value=0, description='StrToComposition', max=235, style=ProgressStyle(description_w…




Unnamed: 0,Compound,Label,struct_file_path,composition
0,W18O49,0,Metals/W18O49_15254.cif,"(W, O)"
1,CaCrO3,0,Metals/CaCrO3_245840.cif,"(Ca, Cr, O)"
2,TiO,0,Metals/TiO_56612.cif,"(Ti, O)"
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif,"(Sr, La, Ni, O)"
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif,"(Sr, Ru, O)"


In [7]:
# Add composition features based on elemental properties
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")
df.head()

HBox(children=(IntProgress(value=0, description='ElementProperty', max=235, style=ProgressStyle(description_wi…




Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,W18O49,0,Metals/W18O49_15254.cif,"(W, O)",8.0,74.0,66.0,25.731343,25.935398,8.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,70.298507,85.272444,12.0
1,CaCrO3,0,Metals/CaCrO3_245840.cif,"(Ca, Cr, O)",8.0,24.0,16.0,13.6,6.72,8.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
2,TiO,0,Metals/TiO_56612.cif,"(Ti, O)",8.0,22.0,14.0,15.0,7.0,8.0,...,2.3e-05,1.1e-05,1.1e-05,0.0,12.0,194.0,182.0,103.0,91.0,12.0
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif,"(Sr, La, Ni, O)",8.0,57.0,49.0,22.142857,16.163265,8.0,...,0.595395,0.085056,0.145811,0.0,12.0,225.0,213.0,98.857143,99.265306,12.0
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif,"(Sr, Ru, O)",8.0,44.0,36.0,21.714286,15.673469,8.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,98.857143,99.265306,12.0


In [8]:
# Oxidation states are not initially provided, so we have Pymatgen try to guess
# This does not work with non-integer stoichiometries, so we force matminer to ignore errors
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

df = CompositionToOxidComposition(return_original_on_error=True,).featurize_dataframe(
    df, "composition", ignore_errors=True
)

HBox(children=(IntProgress(value=0, description='CompositionToOxidComposition', max=235, style=ProgressStyle(d…




In [9]:
# Add descriptors based on oxidation state of elements in the Composition

os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid", ignore_errors=True)
df.head()

HBox(children=(IntProgress(value=0, description='OxidationStates', max=235, style=ProgressStyle(description_wi…




Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state
0,W18O49,0,Metals/W18O49_15254.cif,"(W, O)",8.0,74.0,66.0,25.731343,25.935398,8.0,...,229.0,217.0,70.298507,85.272444,12.0,"(W2+, W4+, W6+, O2-)",-2.0,6.0,8.0,5.235056
1,CaCrO3,0,Metals/CaCrO3_245840.cif,"(Ca, Cr, O)",8.0,24.0,16.0,13.6,6.72,8.0,...,229.0,217.0,98.0,103.2,12.0,"(Ca2+, Cr4+, O2-)",-2.0,4.0,6.0,3.380617
2,TiO,0,Metals/TiO_56612.cif,"(Ti, O)",8.0,22.0,14.0,15.0,7.0,8.0,...,194.0,182.0,103.0,91.0,12.0,"(Ti2+, O2-)",-2.0,2.0,4.0,2.828427
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif,"(Sr, La, Ni, O)",8.0,57.0,49.0,22.142857,16.163265,8.0,...,225.0,213.0,98.857143,99.265306,12.0,"(Sr2+, La3+, Ni3+, O2-)",-2.0,3.0,5.0,2.977695
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif,"(Sr, Ru, O)",8.0,44.0,36.0,21.714286,15.673469,8.0,...,225.0,213.0,98.857143,99.265306,12.0,"(Sr2+, Ru4+, O2-)",-2.0,4.0,6.0,3.162278


In [10]:
# Count how many oxidation state values are missing
# Looks like only 19
df.iloc[:, -5:].isna().sum()

composition_oxid            0
minimum oxidation state    19
maximum oxidation state    19
range oxidation state      19
std_dev oxidation state    19
dtype: int64

In [11]:
df.loc[df['minimum oxidation state'] == 0]

Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state
8,SrFeO3,0,Metals/SrFeO3_91062.cif,"(Sr, Fe, O)",8.0,38.0,30.0,17.6,11.52,8.0,...,229.0,217.0,98.0,103.2,12.0,"(Sr0+, Fe0+, O0+)",0.0,0.0,0.0,0.0
15,PrO,0,Metals/PrO_77652.cif,"(Pr, O)",8.0,59.0,51.0,33.5,25.5,8.0,...,194.0,182.0,103.0,91.0,12.0,"(Pr0+, O0+)",0.0,0.0,0.0,0.0
35,CeO,0,Metals/CeO_52886.cif,"(Ce, O)",8.0,58.0,50.0,33.0,25.0,8.0,...,194.0,182.0,103.0,91.0,12.0,"(Ce0+, O0+)",0.0,0.0,0.0,0.0
46,Si,1,Insulators/Si_diamond_CollCode51688.cif,(Si),14.0,14.0,0.0,14.0,0.0,14.0,...,227.0,0.0,227.0,0.0,227.0,(Si0+),0.0,0.0,0.0,0.0
134,CaFeO3,2,MIT_materials/LowT/CaFeO3_LT_92336.cif,"(Ca, Fe, O)",8.0,26.0,18.0,14.0,7.2,8.0,...,229.0,217.0,98.0,103.2,12.0,"(Ca0+, Fe0+, O0+)",0.0,0.0,0.0,0.0
206,CaFeO3,2,MIT_materials/HighT/CaFeO3_HT_92330.cif,"(Ca, Fe, O)",8.0,26.0,18.0,14.0,7.2,8.0,...,229.0,217.0,98.0,103.2,12.0,"(Ca0+, Fe0+, O0+)",0.0,0.0,0.0,0.0


In [12]:
# Remove compounds with predicted oxidation states of 0 
df = df.loc[df['minimum oxidation state'] != 0]
df.describe()

Unnamed: 0,Label,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,...,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state
count,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,...,229.0,229.0,229.0,229.0,229.0,229.0,210.0,210.0,210.0,210.0
mean,1.336245,8.49345,47.943231,39.449782,20.417389,14.201585,8.558952,30.742358,87.074236,56.331878,...,15.80786,218.694323,202.886463,94.784934,93.911435,15.80786,-2.0,3.547619,5.547619,3.443955
std,0.716685,2.139069,18.957508,19.198258,6.192139,7.16413,2.043876,21.567499,0.278923,21.488754,...,14.379497,16.585319,22.36647,15.640737,10.756462,14.379497,0.0,1.002446,1.002446,0.530358
min,0.0,3.0,12.0,4.0,9.458333,2.0,8.0,1.0,87.0,6.0,...,12.0,141.0,124.0,45.090909,54.14876,12.0,-2.0,2.0,4.0,2.54951
25%,1.0,8.0,26.0,18.0,16.2642,8.16,8.0,8.0,87.0,38.0,...,12.0,217.0,182.0,84.8,87.36,12.0,-2.0,3.0,5.0,3.162278
50%,1.0,8.0,57.0,49.0,19.0,12.10384,8.0,25.0,87.0,62.0,...,12.0,225.0,213.0,94.0,95.76,12.0,-2.0,3.0,5.0,3.273268
75%,2.0,8.0,60.0,52.0,24.0,17.76,8.0,49.0,87.0,79.0,...,12.0,229.0,217.0,98.8,101.34,12.0,-2.0,4.0,6.0,3.557282
max,2.0,16.0,83.0,75.0,45.0,37.0,16.0,81.0,89.0,86.0,...,70.0,229.0,217.0,149.5,108.5,70.0,-2.0,7.0,9.0,5.656854


In [13]:
df_composition = df.copy()
# df_composition.to_excel("../data/IMT_Classifcation_Dataset_composition_features.xlsx")

# Structure based features

In [14]:
# Load structures into dataframe
def load_structure(path):
    """Uses path string to read in Pymatgen structure object."""
    struct = mg.Structure.from_file("../Structures/" + path)
    return struct

In [15]:
df['structure'] = df['struct_file_path'].apply(load_structure)
df.head()


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid finite precision errors.



Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,structure
0,W18O49,0,Metals/W18O49_15254.cif,"(W, O)",8.0,74.0,66.0,25.731343,25.935398,8.0,...,217.0,70.298507,85.272444,12.0,"(W2+, W4+, W6+, O2-)",-2.0,6.0,8.0,5.235056,"[[15.37975385 1.893 6.78296238] W5+, [ ..."
1,CaCrO3,0,Metals/CaCrO3_245840.cif,"(Ca, Cr, O)",8.0,24.0,16.0,13.6,6.72,8.0,...,217.0,98.0,103.2,12.0,"(Ca2+, Cr4+, O2-)",-2.0,4.0,6.0,3.380617,"[[2.58311502 2.83194208 1.872325 ] Ca2+, [0.0..."
2,TiO,0,Metals/TiO_56612.cif,"(Ti, O)",8.0,22.0,14.0,15.0,7.0,8.0,...,182.0,103.0,91.0,12.0,"(Ti2+, O2-)",-2.0,2.0,4.0,2.828427,"[[0. 0. 0.] Ti2+, [3.38051503e-16 2.10215000e+..."
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif,"(Sr, La, Ni, O)",8.0,57.0,49.0,22.142857,16.163265,8.0,...,213.0,98.857143,99.265306,12.0,"(Sr2+, La3+, Ni3+, O2-)",-2.0,3.0,5.0,2.977695,"[[0. 0. 7.92112] Sr2+:0.500, La3+:0...."
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif,"(Sr, Ru, O)",8.0,44.0,36.0,21.714286,15.673469,8.0,...,213.0,98.857143,99.265306,12.0,"(Sr2+, Ru4+, O2-)",-2.0,4.0,6.0,3.162278,"[[0. 0. 8.24768178] Sr2+, [0. ..."


In [16]:
from matminer.featurizers.conversions import StructureToOxidStructure
from matminer.featurizers.structure import EwaldEnergy

df = StructureToOxidStructure().featurize_dataframe(df=df, col_id="structure", ignore_errors=True)
df_feat = EwaldEnergy()
df = df_feat.featurize_dataframe(df, col_id="structure_oxid", ignore_errors=True)
df.head()

HBox(children=(IntProgress(value=0, description='StructureToOxidStructure', max=229, style=ProgressStyle(descr…




HBox(children=(IntProgress(value=0, description='EwaldEnergy', max=229, style=ProgressStyle(description_width=…




Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,structure,structure_oxid,ewald_energy
0,W18O49,0,Metals/W18O49_15254.cif,"(W, O)",8.0,74.0,66.0,25.731343,25.935398,8.0,...,85.272444,12.0,"(W2+, W4+, W6+, O2-)",-2.0,6.0,8.0,5.235056,"[[15.37975385 1.893 6.78296238] W5+, [ ...","[[15.37975385 1.893 6.78296238] W5+, [ ...",-4137.707803
1,CaCrO3,0,Metals/CaCrO3_245840.cif,"(Ca, Cr, O)",8.0,24.0,16.0,13.6,6.72,8.0,...,103.2,12.0,"(Ca2+, Cr4+, O2-)",-2.0,4.0,6.0,3.380617,"[[2.58311502 2.83194208 1.872325 ] Ca2+, [0.0...","[[2.58311502 2.83194208 1.872325 ] Ca2+, [0.0...",-755.310891
2,TiO,0,Metals/TiO_56612.cif,"(Ti, O)",8.0,22.0,14.0,15.0,7.0,8.0,...,91.0,12.0,"(Ti2+, O2-)",-2.0,2.0,4.0,2.828427,"[[0. 0. 0.] Ti2+, [3.38051503e-16 2.10215000e+...","[[0. 0. 0.] Ti2+, [3.38051503e-16 2.10215000e+...",-191.532533
3,SrLaNiO4,0,Metals/LaSrNiO4_CollCode69174.cif,"(Sr, La, Ni, O)",8.0,57.0,49.0,22.142857,16.163265,8.0,...,99.265306,12.0,"(Sr2+, La3+, Ni3+, O2-)",-2.0,3.0,5.0,2.977695,"[[0. 0. 7.92112] Sr2+:0.500, La3+:0....","[[0. 0. 7.92112] Sr2+:0.500, La3+:0....",-408.388135
4,Sr2RuO4,0,Metals/Sr2RuO4_41604.cif,"(Sr, Ru, O)",8.0,44.0,36.0,21.714286,15.673469,8.0,...,99.265306,12.0,"(Sr2+, Ru4+, O2-)",-2.0,4.0,6.0,3.162278,"[[0. 0. 8.24768178] Sr2+, [0. ...","[[0. 0. 8.24768178] Sr2+, [0. ...",-439.970854


In [17]:
# Check how many structures don't have ewald energy calculated
df.loc[df['ewald_energy'].isnull()]

Unnamed: 0,Compound,Label,struct_file_path,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,...,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,composition_oxid,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,structure,structure_oxid,ewald_energy


In [18]:
from matminer.featurizers.structure import StructuralHeterogeneity

df = StructuralHeterogeneity().featurize_dataframe(df=df, col_id="structure", ignore_errors=True)

HBox(children=(IntProgress(value=0, description='StructuralHeterogeneity', max=229, style=ProgressStyle(descri…




In [19]:
# df.to_excel("../data/IMT_Classification_Dataset_structure_features.xlsx")

# Calculate global instability index

In [20]:
test_struct = df['structure_oxid'].iloc[1]
site1 = test_struct[6]

In [24]:
from matminer.featurizers.structure import GlobalInstabilityIndex
df = GlobalInstabilityIndex().featurize_dataframe(df=df, col_id='structure_oxid', ignore_errors=True)
df.rename(columns={'global instability index':'gii'}, inplace=True)

HBox(children=(IntProgress(value=0, description='GlobalInstabilityIndex', max=229, style=ProgressStyle(descrip…


GII extremely large. Table parameters may not be suitable or structure may be unusual.


GII extremely large. Table parameters may not be suitable or structure may be unusual.


GII extremely large. Table parameters may not be suitable or structure may be unusual.







GII extremely large. Table parameters may not be suitable or structure may be unusual.


GII extremely large. Table parameters may not be suitable or structure may be unusual.


GII extremely large. Table parameters may not be suitable or structure may be unusual.



# Handcrafted features

In [26]:
test_comp = df.loc[df['Compound']=='BaTiO3', 'composition_oxid'].values[0]
test_comp

Comp: Ba1 Ti1 O3

In [27]:
test_struct = df.loc[df['Compound']=='BaTiO3', 'structure_oxid'].values[0]
test_struct.get_space_group_info()

('P4/mmm', 123)

In [28]:
test_struct

Structure Summary
Lattice
    abc : 3.9998 3.9998 4.018
 angles : 90.0 90.0 90.0
 volume : 64.28157136072
      A : 3.9998 0.0 2.4491711336147917e-16
      B : 6.432168974176612e-16 3.9998 2.4491711336147917e-16
      C : 0.0 0.0 4.018
PeriodicSite: Ba2+ (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: Ti4+ (1.9999, 1.9999, 2.0090) [0.5000, 0.5000, 0.5000]
PeriodicSite: O2- (1.9999, 1.9999, 0.0000) [0.5000, 0.5000, 0.0000]
PeriodicSite: O2- (0.0000, 1.9999, 2.0090) [0.0000, 0.5000, 0.5000]
PeriodicSite: O2- (1.9999, 0.0000, 2.0090) [0.5000, 0.0000, 0.5000]

## Identify relevant metal site

In [29]:
def find_metal(structure):
    """Find metal species by electronegativity ranking.
    Args:
        structure: Pymatgen Structure object
    Returns:
        metal: str, name of relevant metal element 
    """
    anions = ['O', 'F', 'N', 'S', 'Se']
    metal = str(structure.composition.element_composition.elements[-2])
    if metal in anions: # If there are two anions return next element
        metal = str(structure.composition.element_composition.elements[-3])
    return metal

In [30]:
msite = find_metal(test_struct)

## Compute metal-ligand and metal-metal neigbor distances

In [31]:
from pymatgen.util import coord
from pymatgen.analysis import bond_valence

def calc_mx_dists(structure, Msite, cutoff=2.8):
    """
    Returns list of metal-anion distances of structure.
    Args:
        structure: Pymatgen Structure object
        Msite: string, name of metal species, e.g. 'Ti'
        cutoff: float, maximum distance for counting as neighbor
    Returns:
        distances: List of metal-anion distances as floats
    """
    distances = []
    # determine Msite and oxygen indexes
    for site in structure.sites:
        if Msite == str(site.species.element_composition.elements[0]):
            neighbors = structure.get_neighbors(site, r = cutoff, include_index=True)
            for neighbor in neighbors:
                elems_on_neighsite = structure.species_and_occu[neighbor[2]].elements
                symbols = [elem.symbol for elem in elems_on_neighsite]
                if Msite in symbols:
                    continue
                else:
                    distances.append(neighbor[1])
    return np.round(distances, decimals=3)


def calc_mm_dists(structure, Msite, cutoff=5):
    """
    Returns list of metal-metal distances of structure.
    Args:
        structure: Pymatgen Structure object
        Msite: string, name of metal species, e.g. 'Ti'
        cutoff: float, maximum distance for counting as neighbor
    Returns:
        distances: List of metal-metal distances as floats
    """
    distances = []
    # determine Msite indexes
    for site in structure.sites:
        if Msite == str(site.species.element_composition.elements[0]):
            neighbors = structure.get_neighbors(site, r = cutoff, include_index=True)
            for neighbor in neighbors:
                elems_on_neighsite = structure.species_and_occu[neighbor[2]].elements
                symbols = [elem.symbol for elem in elems_on_neighsite]
                if Msite in symbols:
                    distances.append(neighbor[1])
                else:
                    continue
    return np.round(distances, decimals=3)


def calc_xx_dists(structure, cutoff=3.8):
    """
    Returns list of ligand-ligand distances of structure.
    Args:
        structure: Pymatgen Structure object
        Msite: string, name of ligand species, e.g. 'O'
        cutoff: float, maximum distance for counting as neighbor
    Returns:
        distances: List of metal-metal distances as floats
    """
    elems = [str(x) for x in structure.composition.element_composition.elements]
    anions = ['O', 'F', 'N', 'S', 'Se']
    ligands = [anion for anion in anions if anion in elems]
    distances = []
    # determine ligand indexes
    for site in structure.sites:
        for ligand in ligands:
            if ligand == str(site.species.element_composition.elements[0]):
                neighbors = structure.get_neighbors(site, r = cutoff, include_index=True)
                for neighbor in neighbors:
                    elems_on_neighsite = structure.species_and_occu[neighbor[2]].elements
                    symbols = [elem.symbol for elem in elems_on_neighsite]
                    if symbols[0] in ligands:
                        distances.append(neighbor[1])
                    else:
                        continue
    return np.round(distances, decimals=3)

In [32]:
print((calc_mm_dists(test_struct, Msite=msite)))
print((calc_mx_dists(test_struct, Msite=msite)))
print((calc_xx_dists(test_struct)))

[4.018 4.    4.    4.018 4.    4.   ]
[2.009 2.    2.    2.009 2.    2.   ]
[2.835 2.835 2.835 2.835 2.835 2.835 2.835 2.835 2.835 2.828 2.835 2.828
 2.835 2.828 2.835 2.828 2.835 2.828 2.835 2.835 2.828 2.835 2.828 2.828]


## Madelung site potentials

In [33]:
from pymatgen.analysis.ewald import EwaldSummation
def calc_potentials(struct_oxid):
    '''Determines the site Madelung energies for the Msite and Xsite.
    Args:
        struct_oxid: Pymatgen structure object with oxidation states
    Returns:
        List of metal-metal and metal-anion potentials
    '''

    # Determine M and X
    metal = find_metal(struct_oxid)
    anion = str(struct_oxid.composition.element_composition.elements[-1])
    M_indices = []
    X_indices = []

    # calculate Madelung energy
    ews = EwaldSummation(struct_oxid)
    site_energies = np.array([])
    site_energies = sum(ews.total_energy_matrix)
    
    
    for index in range(len(struct_oxid.sites)):
        if metal in str(struct_oxid[index].species.element_composition.elements[0]):
            M_indices.append(index)
        elif anion in str(struct_oxid[index].species.element_composition.elements[0]):
            X_indices.append(index)
    

    # max V_mad_M and min V_mad_O gives the min CT energy
    # We convert from eV to V by dividing by oxstate and removing double-counting correction, i.e. multiply by 2
    max_m_energy = max(site_energies[M_indices])
    max_m_index = np.where(site_energies == max_m_energy)[0][0]
    metal_oxi_state = struct_oxid[max_m_index].species.elements[0].oxi_state
    V_mad_M = max_m_energy/metal_oxi_state*2
    V_mad_X = min(-site_energies[X_indices])
    
    return [V_mad_M, V_mad_X]

In [34]:
calc_potentials(test_struct)

[-44.49288844392711, 23.135156382946764]

## Lookup ionization energies

In [35]:
def compute_metal_valences(structure):
    """Returns valences of metal sites in structure.
    Args:
        structure: Pymatgen Structure object
    Returns:
        valences: list of floats, valences of all metal species
    """
    metal = find_metal(structure)
    valences = []
    for site in structure:
        if str(site.species.element_composition.elements[0]) == metal:
            valences.append(site.species.elements[0].oxi_state)
    return valences
print(min(compute_metal_valences(test_struct)))

4.0


In [36]:
def lookup_ionizations(structure):
    """Compute first and second ionization energies for metal species with lowest valence
    Args:
        structure: Pymatgen Structure object
    Returns:
        Tuple with 1st and 2nd ionization energies
    """
    metal = find_metal(structure)
    valences = compute_metal_valences(structure)
    min_valence = min(valences)
    msites = pd.read_csv("../data/B-site.csv")
    sample = msites.loc[(msites['Element'] == metal) & (msites['Formal.ox.state'] == min_valence)]
    try:
        if sample.empty:
            n = float(str(min_valence-int(min_valence))[1:]) # Find decimal values
            sample1 = msites.loc[(msites['Element'] == metal) & (msites['Formal.ox.state'] == min_valence-n)]
            sample2 = msites.loc[(msites['Element'] == metal) & (msites['Formal.ox.state'] == min_valence+1-n)]
            ie1 = sample1['IE1'].values[0]*(1-n) + sample2['IE1'].values[0]*(n)
            ie2 = sample1['IE2'].values[0]*(1-n) + sample2['IE2'].values[0]*(n)
        else:
            ie1 = sample['IE1'].values[0]
            ie2 = sample['IE2'].values[0]
    except: 
        print('Error finding info for element {} with valence {}'.format(metal, min_valence))
    return (ie1, ie2)

In [37]:
lookup_ionizations(test_struct)

(43.27, 99.3)

## Calculate theoretical estimates of Hubbard U and Charge Transfer Gap

In [38]:
def calc_hubu(ie1, ie2, mm_avg):
    """Compute ionic model estimate of Hubbard U
    Args:
        ie1: Float, ionization energy of metal in its ox state
        ie2: Float, ionization energy of metal in ox state +1
        mm_avg: Float, average nearest neighbor metal-metal distance
    Returns:
        esthubu: Float, ionic model estimate of Hubbard U
    """
    conversion_factor = 14.39965 # Conversion factor to match results in Torrance et al.
    esthubu =  ie2 - ie1 - conversion_factor / mm_avg
    return esthubu

In [39]:
ie1, ie2 = lookup_ionizations(test_struct)
mm_avg = np.average(calc_mm_dists(test_struct, find_metal(test_struct)))
calc_hubu(ie1, ie2, mm_avg)

52.43547928107838

In [40]:
def calc_ct(ie1, v_mm, v_mx, mx_avg, anion='O'):
    """Compute ionic model estimate of charge transfer gap
    Args:
        ie1: Float, ionization energy of metal in its ox state
        v_mm: Float, Madelung site potential for metal atoms
        v_mx: Float, Madelung site potential for anion atoms
        mx_avg: Float, average nearest neighbor metal-anion distance
    Returns:
        estct: Float, ionic model estimate of charge transfer gap
    """
    conversion_factor = 14.39965 # Conversion factor to match results in Torrance et al.
    ie_x = {'O': 7.71, 'S': 4.73, 'N': 6.98}[anion] # ionization energy for anion 2-
    estct = -(v_mm - v_mx) - ie_x - ie1 - conversion_factor / mx_avg
    return estct

In [41]:
v_mm, v_mx = calc_potentials(test_struct)
print('{}\n{}'.format(v_mm, v_mx))
mx_avg = np.round(np.average(calc_mx_dists(test_struct, Msite=find_metal(test_struct))), decimals=3)
print(mx_avg)
anion = str(test_struct.composition.element_composition.elements[-1])
calc_ct(ie1=ie1, v_mm=v_mm, v_mx=v_mx, mx_avg=mx_avg, anion=anion)

-44.49288844392711
23.135156382946764
2.003


9.459003389030638

In [42]:
def volume_per_atom(struct):
    """Volume per atom calculator."""
    return struct.volume / struct.num_sites

In [43]:
def featurize_structure(struct):
    """Returns a list of features for the structure.
    Args:
        struct: Pymatgen structure with oxidation states provided
    Returns:
        features: List of features for structure
    """
    features = []
    if struct.is_ordered:
        features.append(0)
    else:
        features.append(1)
    metal = find_metal(structure=struct)
    
    try:
        mx_dists = calc_mx_dists(structure=struct, Msite=metal)
        mx_dists = [np.max(mx_dists), np.min(mx_dists), np.round(np.average(mx_dists), 3)]
    except:
        mx_dists = [None, None, None]
    features.extend(mx_dists)
        
    try:
        mm_dists = calc_mm_dists(structure=struct, Msite=metal)
        mm_dists = [np.max(mm_dists), np.min(mm_dists), np.round(np.average(mm_dists), 3)]
    except:
        mm_dists = [None, None, None]
    features.extend(mm_dists)
    
    try:
        xx_dists = calc_xx_dists(structure=struct)
        xx_dists = [np.max(xx_dists), np.min(xx_dists), np.round(np.average(xx_dists), 3)]
    except:
        xx_dists = [None, None, None]
    features.extend(xx_dists)
    
    try:
        v_mad_m, v_mad_x = calc_potentials(struct)
    except:
        v_mad_m, v_mad_x = (None, None)
    features.extend([v_mad_m, v_mad_x])
    
    try:
        ie1, ie2 = lookup_ionizations(structure=struct)
    except:
        ie1, ie2 = (None, None)
    try:
        esthubu = calc_hubu(ie1, ie2, mm_avg=mm_dists[2])
    except:
        esthubu = None
    try:
        estct = calc_ct(ie1, v_mad_m, v_mad_x, mx_avg=mx_dists[2])
    except:
        estct = None

    features.extend([ie1, ie2])
    features.append(esthubu)
    features.append(estct)
        
    features.append(volume_per_atom(struct))
        
    return features

In [44]:
featurize_structure(test_struct)

[0,
 2.009,
 2.0,
 2.003,
 4.018,
 4.0,
 4.006,
 2.835,
 2.828,
 2.833,
 -44.49288844392711,
 23.135156382946764,
 43.27,
 99.3,
 52.43547928107838,
 9.459003389030638,
 12.856314272144]

# Add handbuilt features to dataframe

In [45]:
labels = ['struct_disordered', 'd_mx_max', 'd_mx_min', 'd_mx_avg',
          'd_mm_max', 'd_mm_min', 'd_mm_avg', 
          'd_xx_max', 'd_xx_min', 'd_xx_avg',
          'v_mad_m', 'v_mad_x', 'ie1', 'ie2',
          'esthubu', 'estct', 'vol_per_atom']
entries = df['structure_oxid'].values

In [46]:
tqdm_func = tqdm.tqdm_notebook
entries = tqdm_func(list(entries), desc='Handbuilt featurizer')
features = [featurize_structure(x) for x in entries]

HBox(children=(IntProgress(value=0, description='Handbuilt featurizer', max=229, style=ProgressStyle(descripti…

Error finding info for element Sn with valence 4.0
Error finding info for element Pr with valence 4.0
Error finding info for element Ce with valence 4.0
Error finding info for element Sn with valence 4.0
Error finding info for element Cd with valence 2.0
Error finding info for element Sn with valence 4.0
Error finding info for element Cd with valence 2.0
Error finding info for element Ce with valence 4.0
Error finding info for element Yb with valence 2.0
Error finding info for element Sn with valence 4.0
Error finding info for element Pr with valence 4.0



invalid value encountered in double_scalars



Error finding info for element V with valence 0.0
Error finding info for element Ti with valence 0.0



In [47]:
res = pd.DataFrame(features, index=df.index, columns=labels)

In [48]:
new = pd.concat([df, res], axis=1)
new = new[df.columns.tolist() + res.columns.tolist()]

In [49]:
new.describe()

Unnamed: 0,Label,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,...,d_xx_max,d_xx_min,d_xx_avg,v_mad_m,v_mad_x,ie1,ie2,esthubu,estct,vol_per_atom
count,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,...,225.0,225.0,225.0,225.0,227.0,215.0,215.0,215.0,214.0,229.0
mean,1.336245,8.49345,47.943231,39.449782,20.417389,14.201585,8.558952,30.742358,87.074236,56.331878,...,3.388773,2.716516,2.967391,-35.558686,22.148273,34.598487,55.928152,17.468297,8.631931,12.217363
std,0.716685,2.139069,18.957508,19.198258,6.192139,7.16413,2.043876,21.567499,0.278923,21.488754,...,0.327351,0.27894,0.181029,8.756746,4.00324,12.463723,20.142396,12.787829,7.670465,2.863909
min,0.0,3.0,12.0,4.0,9.458333,2.0,8.0,1.0,87.0,6.0,...,2.69,0.321,2.69,-63.152762,-0.465818,11.241,24.915702,6.614026,-26.10129,5.354963
25%,1.0,8.0,26.0,18.0,16.2642,8.16,8.0,8.0,87.0,38.0,...,3.133,2.58,2.861,-42.222966,20.876125,25.563,43.27,12.69913,7.093325,10.109013
50%,1.0,8.0,57.0,49.0,19.0,12.10384,8.0,25.0,87.0,62.0,...,3.454,2.705,2.928,-35.805033,22.907763,35.19,54.92,14.438343,10.166263,11.576092
75%,2.0,8.0,60.0,52.0,24.0,17.76,8.0,49.0,87.0,79.0,...,3.684,2.802,3.011,-30.653074,24.356208,42.36,59.048095,16.614785,13.536702,13.279113
max,2.0,16.0,83.0,75.0,45.0,37.0,16.0,81.0,89.0,86.0,...,3.799,3.636,3.636,3.312863,30.64387,90.63,166.767,118.291186,20.027538,24.879829


In [50]:
list(new)

['Compound',
 'Label',
 'struct_file_path',
 'composition',
 'MagpieData minimum Number',
 'MagpieData maximum Number',
 'MagpieData range Number',
 'MagpieData mean Number',
 'MagpieData avg_dev Number',
 'MagpieData mode Number',
 'MagpieData minimum MendeleevNumber',
 'MagpieData maximum MendeleevNumber',
 'MagpieData range MendeleevNumber',
 'MagpieData mean MendeleevNumber',
 'MagpieData avg_dev MendeleevNumber',
 'MagpieData mode MendeleevNumber',
 'MagpieData minimum AtomicWeight',
 'MagpieData maximum AtomicWeight',
 'MagpieData range AtomicWeight',
 'MagpieData mean AtomicWeight',
 'MagpieData avg_dev AtomicWeight',
 'MagpieData mode AtomicWeight',
 'MagpieData minimum MeltingT',
 'MagpieData maximum MeltingT',
 'MagpieData range MeltingT',
 'MagpieData mean MeltingT',
 'MagpieData avg_dev MeltingT',
 'MagpieData mode MeltingT',
 'MagpieData minimum Column',
 'MagpieData maximum Column',
 'MagpieData range Column',
 'MagpieData mean Column',
 'MagpieData avg_dev Column',
 'Mag

In [51]:
new.drop(columns=['composition', 'composition_oxid', 'structure', 'structure_oxid']).to_excel(
    "../data/IMT_Classification_Dataset_matminer_and_handbuilt_v2.xlsx", index=False)
# new.drop(columns=['composition', 'composition_oxid', 'structure', 'structure_oxid']).to_excel(
#     "../data/MoON_data.xlsx", index=False)