# Collect feature data into master dataframe
---

### Import Modules

In [1]:
import os
# Print the working directory so the run location shows up in the cell output
print(os.getcwd())
import sys
# `ti` is the notebook start time; the last cell uses it to report the run time
import time; ti = time.time()

import pickle
from itertools import combinations
from collections import Counter
from functools import reduce

from IPython.display import display

import numpy as np
import pandas as pd
# Show every column when a dataframe is displayed (no column truncation)
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_rows', None)
# Allow up to 100 characters per cell before pandas truncates the text
pd.options.display.max_colwidth = 100

# #########################################################
# Project-local dataframe accessors (defined in the `methods` module)
from methods import (
    get_df_job_ids,
    get_df_atoms_sorted_ind,
    get_df_jobs_paths,
    get_df_dft,
    get_df_octa_vol,
    get_df_eff_ox,
    get_df_angles,
    get_df_pdos_feat,
    get_df_bader_feat,
    get_df_octa_steric,
    get_df_octa_steric_init,
    get_df_coord,
    )

/mnt/f/Dropbox/01_norskov/00_git_repos/PROJ_IrOx_OER/workflow/feature_engineering


In [2]:
from methods import isnotebook

# Decide once whether we run inside a notebook: that choice drives both the
# tqdm flavour that gets imported and how chatty the later cells are.
isnotebook_i = isnotebook()
verbose = True if isnotebook_i else False
if verbose:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

### Read feature dataframes

In [3]:
# Base dataframes
# These are used for lookups in later cells: `df_dft` and `df_job_ids` when the
# bulk data is added, `df_atoms_sorted_ind` and `df_jobs_paths` in the magmom loop
df_dft = get_df_dft()

df_job_ids = get_df_job_ids()

df_atoms_sorted_ind = get_df_atoms_sorted_ind()

df_jobs_paths = get_df_jobs_paths()

# Features dataframes
# Each of these is passed to `combine_dfs_with_same_cols` further down and
# ends up as part of `df_features`
df_octa_vol = get_df_octa_vol()

df_eff_ox = get_df_eff_ox()

df_angles = get_df_angles()

df_pdos_feat = get_df_pdos_feat()

df_bader_feat = get_df_bader_feat()

In [4]:
# COMBAK
# NOTE(review): "COMBAK" is the author's marker, presumably "come back to this";
# the import below sits outside the top import cell.
# `df_octa_info` is looked up row by row later on, when the `oxy_opp_as_bl` and
# `degrees_off_of_straight__as_opp` features are added to `df_features`.

from methods import get_df_octa_info

df_octa_info = get_df_octa_info()

In [5]:
# `df_octa_steric` is merged into `df_features` in the combine step below.
# `df_octa_steric_init` is loaded here but not referenced again in the cells
# visible in this notebook.
df_octa_steric = get_df_octa_steric()
df_octa_steric_init = get_df_octa_steric_init()

### Filtering down to `oer_adsorbate` jobs

In [6]:
# Keep only the rows whose `job_type` index level is "oer_adsorbate", then drop
# the first index level from the filtered frame.
df_ind = df_atoms_sorted_ind.index.to_frame()
oer_adsorbate_index = df_ind.loc[df_ind["job_type"] == "oer_adsorbate"].index
df_atoms_sorted_ind = (
    df_atoms_sorted_ind
    .loc[oer_adsorbate_index]
    .droplevel(level=0)
    )

In [7]:
from local_methods import combine_dfs_with_same_cols

# Feature tables to merge, keyed by the name of the variable that holds them.
df_dict_i = dict(
    df_eff_ox=df_eff_ox,
    df_octa_vol=df_octa_vol,
    df_angles=df_angles,
    df_pdos_feat=df_pdos_feat,
    df_bader_feat=df_bader_feat,
    df_octa_steric=df_octa_steric,
    )

# Merge all feature tables into the master feature dataframe.
df_features = combine_dfs_with_same_cols(df_dict=df_dict_i, verbose=verbose)

--------------------
col_i: job_id_max
--------------------
col_i: from_oh
--------------------
col_i: active_site
--------------------
col_i: compenv
--------------------
col_i: slab_id
--------------------
col_i: ads
--------------------
col_i: att_num

repated_cols_that_are_identical:
['job_id_max', 'from_oh', 'active_site', 'compenv', 'slab_id', 'ads', 'att_num']


### Adding in bulk data

In [8]:
def method(row_i):
    """Attach bulk-derived features to one row of `df_features`.

    Meant to be used as `df_features.apply(method, axis=1)`. The slab id is
    read from position 1 of the row's index tuple, mapped to its bulk id via
    the module-level `df_job_ids`, and the bulk properties are then read from
    the module-level `df_dft`.

    Parameters
    ----------
    row_i : pandas.Series
        One row of `df_features`; `row_i.name` is the row's index tuple.

    Returns
    -------
    pandas.Series
        The same row with `("features", "dH_bulk")`,
        `("features", "volume_pa")` and `("features", "bulk_oxid_state")` set.

    Raises
    ------
    AssertionError
        If the slab does not map onto exactly one bulk id in `df_job_ids`.
    ValueError
        If the bulk stoichiometry is neither "AB2" nor "AB3". The previous
        implementation only printed a message here and then failed with an
        unrelated `UnboundLocalError`.
    """
    # Bulk oxidation state assigned to each supported stoichiometry
    oxid_state_by_stoich = {
        "AB2": +4,
        "AB3": +6,
        }

    # #####################################################
    slab_id_i = row_i.name[1]
    # #####################################################
    bulk_ids = df_job_ids[df_job_ids.slab_id == slab_id_i].bulk_id.unique()
    mess_i = (
        "Expected exactly one bulk_id for slab_id "
        + repr(slab_id_i) + ", found " + str(len(bulk_ids))
        )
    assert len(bulk_ids) == 1, mess_i
    bulk_id_i = bulk_ids[0]
    # #####################################################

    # #####################################################
    row_dft_i = df_dft.loc[bulk_id_i]
    # #####################################################
    dH_i = row_dft_i.dH
    volume_pa = row_dft_i.volume_pa
    stoich_i = row_dft_i.stoich
    # #####################################################

    # Fail loudly on an unknown stoichiometry instead of carrying on with an
    # undefined oxidation state
    if stoich_i not in oxid_state_by_stoich:
        raise ValueError(
            "Couldn't parse bulk stoich " + repr(stoich_i)
            + " for bulk_id " + repr(bulk_id_i)
            )
    bulk_oxid_state_i = oxid_state_by_stoich[stoich_i]

    # #####################################################
    new_column_values_dict = {
        "dH_bulk": dH_i,
        "volume_pa": volume_pa,
        "bulk_oxid_state": bulk_oxid_state_i,
        }
    # #####################################################
    for key, value in new_column_values_dict.items():
        row_i[("features", key)] = value
    return row_i

# Run `method` over every row, then order the top column level as
# "data" followed by "features".
df_features = (
    df_features
    .apply(method, axis=1)
    .reindex(columns=["data", "features"], level=0)
    )

In [9]:
# Report the size of the feature table after the bulk columns were added
# (only when running verbosely, i.e. inside a notebook)
if verbose:
    print("df_features.shape:", df_features.shape)

# df_features.head()

df_features.shape: (2818, 27)


### Adding magmom data (Spin)

In [10]:
# Collect, for every row of `df_features`, the absolute magnetic moment of the
# active-site atom ("O_magmom") and of an Ir atom among its neighbours
# ("Ir_magmom"), then merge both in as new "features" columns.
data_dict_list = []

# `get_df_coord` is called with a key that depends only on (compenv, slab_id),
# so its result is reused for all rows that share a slab instead of being
# fetched again for every row.
df_coord_cache = dict()

for name_i in df_features.index:
    # #####################################################
    compenv_i = name_i[0]
    slab_id_i = name_i[1]
    ads_i = name_i[2]
    active_site_i = name_i[3]
    att_num_i = name_i[4]
    from_oh_i = name_i[5]
    # #####################################################

    # `df_atoms_sorted_ind` has no `from_oh` level; rows for "o" that did not
    # come from *OH are looked up under the placeholder active site "NaN"
    if ads_i == "o" and not from_oh_i:
        name_new_i = (
            compenv_i, slab_id_i, ads_i, "NaN", att_num_i, )
    else:
        name_new_i = name_i[0:-1]

    # #####################################################
    row_atoms_i = df_atoms_sorted_ind.loc[name_new_i]
    # #####################################################
    magmoms_i = row_atoms_i.magmoms_sorted_good
    atoms_i = row_atoms_i.atoms_sorted_good
    # #####################################################

    # Fall back to the moments stored on the atoms object
    if magmoms_i is None:
        magmoms_i = atoms_i.get_magnetic_moments()

    magmom_active_site_i = magmoms_i[int(active_site_i)]

    # #####################################################
    # Neighbour info of the active site, taken from the initial slab
    init_name_i = (compenv_i, slab_id_i, "o", "NaN", 1)
    if init_name_i not in df_coord_cache:
        df_coord_cache[init_name_i] = get_df_coord(
            mode='init-slab',
            init_slab_name_tuple=init_name_i,
            )
    df_coord_i = df_coord_cache[init_name_i]

    row_coord_i = df_coord_i.loc[active_site_i]

    # Keep the last Ir neighbour listed (same choice as the loop made before)
    nn_Ir = None
    for nn_i in row_coord_i["nn_info"]:
        if nn_i["site"].specie.symbol == "Ir":
            nn_Ir = nn_i

    if nn_Ir is not None:
        Ir_magmom_i = magmoms_i[int(nn_Ir["site_index"])]
    else:
        # Previously `Ir_index` was left undefined here (NameError on the
        # first row) or, worse, still held the value of the previous row,
        # silently producing a wrong magmom. Record a missing value instead.
        print("Ir not found")
        Ir_magmom_i = np.nan

    # #####################################################
    data_dict_i = dict()
    # #####################################################
    data_dict_i["O_magmom"] = np.abs(magmom_active_site_i)
    data_dict_i["Ir_magmom"] = np.abs(Ir_magmom_i)
    # #####################################################
    data_dict_list.append(data_dict_i)
    # #####################################################


# #########################################################
# One record was appended per row, in row order, so the index of
# `df_features` can be reused directly
df_magmom_i = pd.DataFrame(
    data_dict_list,
    index=df_features.index,
    )

# Add level to column index to match `df_features`
df_magmom_i.columns = pd.MultiIndex.from_tuples(
    [("features", col_i) for col_i in df_magmom_i.columns]
    )

df_features = pd.concat([
    df_magmom_i,
    df_features,
    ], axis=1)

# Group the columns by their top level again ("data", then "features")
df_features = df_features.reindex(
    columns=list(df_features.columns.levels[0]),
    level=0)
# #########################################################

### Adding octahedra info from `df_octa_info`

In [11]:
def method(row_i):
    """Copy two octahedra descriptors from `df_octa_info` onto one row.

    Meant to be used as `df_features.apply(method, axis=1)`. The first six
    entries of the row's index tuple, prefixed with "final", form the key
    into the module-level `df_octa_info`.

    Parameters
    ----------
    row_i : pandas.Series
        One row of `df_features`; `row_i.name` is the row's index tuple.

    Returns
    -------
    pandas.Series
        The same row with `("features", "oxy_opp_as_bl")` and
        `("features", "degrees_off_of_straight__as_opp")` set.
    """
    # Key into `df_octa_info`: "final" followed by the six index entries
    index_entries = tuple(row_i.name[position] for position in range(6))
    octa_key = ("final", ) + index_entries

    octa_row = df_octa_info.loc[octa_key]

    # Read both values first, then write them onto the row in this order
    feature_names = ("oxy_opp_as_bl", "degrees_off_of_straight__as_opp")
    octa_features = {
        feature_name: getattr(octa_row, feature_name)
        for feature_name in feature_names
        }

    for feature_name, feature_value in octa_features.items():
        row_i[("features", feature_name)] = feature_value
    return row_i

# Run `method` over every row, then order the top column level as
# "data" followed by "features".
df_features = (
    df_features
    .apply(method, axis=1)
    .reindex(columns=["data", "features"], level=0)
    )

In [12]:
# Ratio of the `active_o_metal_dist` feature to the `oxy_opp_as_bl` feature,
# stored as a new feature column.
active_o_metal_dist = df_features[("features", "active_o_metal_dist")]
oxy_opp_as_bl = df_features[("features", "oxy_opp_as_bl")]

df_features[("features", "as_ir_opp_bl_ratio")] = active_o_metal_dist / oxy_opp_as_bl

In [13]:
# Display the assembled feature table (pandas truncates the rows shown)
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,data,data,data,data,data,data,data,data,data,data,data,data,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,active_site,ads,att_num,compenv,from_oh,job_id_max,slab_id,active_site_orig,found_active_Ir,num_missing_Os,orig_slab_good,used_unrelaxed_df_coord,O_magmom,Ir_magmom,Ir*O_bader,Ir_bader,O_bader,active_o_metal_dist,angle_O_Ir_surf_norm,closest_Ir_dist,closest_O_dist,effective_ox_state,ir_o_mean,ir_o_std,octa_vol,p_band_center,dH_bulk,volume_pa,bulk_oxid_state,oxy_opp_as_bl,degrees_off_of_straight__as_opp,as_ir_opp_bl_ratio
compenv,slab_id,ads,active_site,att_num,from_oh,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2
nersc,buvivore_13,o,38.0,1,False,38.0,o,1,nersc,False,dadolita_33,buvivore_13,,True,0.0,True,False,0.695,0.697,,,,1.782250,1.087630,3.686693,3.267705,5.666667,1.998366,0.103911,10.178232,,-0.568450,12.531646,4,2.104219,11.068923,0.846989
nersc,buvivore_13,o,38.0,1,True,38.0,o,1,nersc,True,fidivuwi_89,buvivore_13,38,True,0.0,True,False,0.696,0.730,,,,1.782397,1.456500,3.686693,3.267705,5.666667,1.999222,0.104375,10.174465,,-0.568450,12.531646,4,2.105314,11.713935,0.846618
nersc,buvivore_13,oh,38.0,0,True,38.0,oh,0,nersc,True,nipidida_98,buvivore_13,38,True,0.0,True,False,0.124,0.405,,,,1.924408,1.782806,3.633028,3.224902,5.666667,2.002212,0.051808,10.295925,,-0.568450,12.531646,4,2.027490,9.275869,0.949158
nersc,buvivore_13,oh,38.0,1,True,38.0,oh,1,nersc,True,kulurono_32,buvivore_13,38,True,0.0,True,False,0.125,0.407,,,,1.924107,1.212678,3.646938,3.231333,5.666667,2.002121,0.051862,10.296466,,-0.568450,12.531646,4,2.027234,9.662194,0.949129
nersc,buvivore_13,oh,38.0,2,True,38.0,oh,2,nersc,True,sipobodi_93,buvivore_13,38,True,0.0,True,False,0.126,0.293,,,,1.924703,2.762618,3.700919,3.225617,5.666667,2.003249,0.050612,10.297750,,-0.568450,12.531646,4,2.027623,13.347782,0.949241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
slac,wufulafe_03,oh,57.0,3,True,57.0,oh,3,slac,True,bureruba_01,wufulafe_03,57,True,0.0,True,False,0.177,0.300,,,,1.954241,32.710859,3.498887,2.714694,6.333333,1.951573,0.046556,9.765829,,-0.700424,11.389867,4,2.021335,4.054109,0.966807
slac,wufulafe_03,oh,58.0,0,True,58.0,oh,0,slac,True,sudubuwe_77,wufulafe_03,58,True,0.0,True,False,0.027,0.001,,,,1.929257,17.238952,3.612560,3.418859,6.333333,1.968887,0.059495,9.983895,,-0.700424,11.389867,4,1.999236,5.588101,0.964997
slac,wufulafe_03,oh,58.0,1,True,58.0,oh,1,slac,True,gavenumu_58,wufulafe_03,58,True,0.0,True,False,0.051,0.046,,,,1.930246,14.452806,3.695557,3.435442,6.333333,1.968465,0.059521,9.976544,,-0.700424,11.389867,4,2.008586,4.970114,0.960998
slac,wufulafe_03,oh,58.0,2,True,58.0,oh,2,slac,True,meweduse_57,wufulafe_03,58,True,0.0,True,False,0.040,0.010,,,,1.928307,13.779092,3.739121,3.486122,6.333333,1.967395,0.063467,9.958193,,-0.700424,11.389867,4,2.014683,3.963043,0.957127


In [14]:
# assert False

### Save data to pickle

In [15]:
# Output location: <PROJ_irox_oer>/workflow/feature_engineering/out_data
root_path_i = os.path.join(
    os.environ["PROJ_irox_oer"],
    "workflow/feature_engineering")

# Pickling data ###########################################
directory = os.path.join(root_path_i, "out_data")
# `exist_ok=True` replaces the exists()/makedirs() pair, which could fail if
# the directory appeared between the check and the creation
os.makedirs(directory, exist_ok=True)
path_i = os.path.join(directory, "df_features.pickle")
with open(path_i, "wb") as fle:
    pickle.dump(df_features, fle)
# #########################################################

# #########################################################
# Read the file straight back as a round-trip check. The file was written by
# this very cell, so unpickling it is trusted; do not reuse this pattern on
# pickle files from an untrusted source.
with open(path_i, "rb") as fle:
    df_features = pickle.load(fle)
# #########################################################

In [16]:
# Show the first rows returned by the project accessor `get_df_features`.
# NOTE(review): presumably this reads back the pickle written in the previous
# cell — confirm against the implementation in `methods`.
from methods import get_df_features
get_df_features().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,data,data,data,data,data,data,data,data,data,data,data,data,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,active_site,ads,att_num,compenv,from_oh,job_id_max,slab_id,active_site_orig,found_active_Ir,num_missing_Os,orig_slab_good,used_unrelaxed_df_coord,O_magmom,Ir_magmom,Ir*O_bader,Ir_bader,O_bader,active_o_metal_dist,angle_O_Ir_surf_norm,closest_Ir_dist,closest_O_dist,effective_ox_state,ir_o_mean,ir_o_std,octa_vol,p_band_center,dH_bulk,volume_pa,bulk_oxid_state,oxy_opp_as_bl,degrees_off_of_straight__as_opp,as_ir_opp_bl_ratio
compenv,slab_id,ads,active_site,att_num,from_oh,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2
nersc,buvivore_13,o,38.0,1,False,38.0,o,1,nersc,False,dadolita_33,buvivore_13,,True,0.0,True,False,0.695,0.697,,,,1.78225,1.08763,3.686693,3.267705,5.666667,1.998366,0.103911,10.178232,,-0.56845,12.531646,4,2.104219,11.068923,0.846989
nersc,buvivore_13,o,38.0,1,True,38.0,o,1,nersc,True,fidivuwi_89,buvivore_13,38.0,True,0.0,True,False,0.696,0.73,,,,1.782397,1.4565,3.686693,3.267705,5.666667,1.999222,0.104375,10.174465,,-0.56845,12.531646,4,2.105314,11.713935,0.846618
nersc,buvivore_13,oh,38.0,0,True,38.0,oh,0,nersc,True,nipidida_98,buvivore_13,38.0,True,0.0,True,False,0.124,0.405,,,,1.924408,1.782806,3.633028,3.224902,5.666667,2.002212,0.051808,10.295925,,-0.56845,12.531646,4,2.02749,9.275869,0.949158
nersc,buvivore_13,oh,38.0,1,True,38.0,oh,1,nersc,True,kulurono_32,buvivore_13,38.0,True,0.0,True,False,0.125,0.407,,,,1.924107,1.212678,3.646938,3.231333,5.666667,2.002121,0.051862,10.296466,,-0.56845,12.531646,4,2.027234,9.662194,0.949129
nersc,buvivore_13,oh,38.0,2,True,38.0,oh,2,nersc,True,sipobodi_93,buvivore_13,38.0,True,0.0,True,False,0.126,0.293,,,,1.924703,2.762618,3.700919,3.225617,5.666667,2.003249,0.050612,10.29775,,-0.56845,12.531646,4,2.027623,13.347782,0.949241


In [17]:
# #########################################################
# Closing banner: completion notice, elapsed wall time (in minutes, measured
# from `ti`) and the notebook name, framed by two separator lines.
separator_line = 20 * "# # "

print(separator_line)
print("All done!")
elapsed_minutes = (time.time() - ti) / 60
print("Run time:", np.round(elapsed_minutes, 3), "min")
print("collect_feature_data.ipynb")
print(separator_line)
# #########################################################

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
All done!
Run time: 1.597 min
collect_feature_data.ipynb
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
