# Import Modules

In [1]:
import os
import sys

import pickle

import numpy as np
import pandas as pd

# #############################################################################
sys.path.insert(0, os.path.join(os.environ["PROJ_irox"], "data"))

from proj_data_irox import (
    bulk_dft_data_path,
    unique_ids_path,
    prototypes_data_path,
    static_irox_structures_path)

print(os.getcwd())

/mnt/f/Dropbox/01_norskov/00_git_repos/PROJ_IrOx_Active_Learning_OER/workflow/ml_modelling/processing_bulk_dft


# Read Data

In [2]:
# Load the pickled DataFrame stored under parse_chris_bulk_dft/out_data and
# label every row as coming from "chris".
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
chris_rel_parts = (
    "workflow/ml_modelling",
    "processing_bulk_dft/parse_chris_bulk_dft/out_data",
    "df_dft_calcs.pickle",
)
path_i = os.path.join(os.environ["PROJ_irox"], *chris_rel_parts)

with open(path_i, "rb") as fle:
    df_chris = pickle.load(fle)

df_chris["source"] = "chris"

In [3]:
# Load the pickled DataFrame stored under parse_my_bulk_dft/out_data and
# label every row as coming from "raul".
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
raul_rel_parts = (
    "workflow/ml_modelling/processing_bulk_dft",
    "parse_my_bulk_dft/out_data",
    "df_bulk_raul_irox.pickle",
)
path_i = os.path.join(os.environ["PROJ_irox"], *raul_rel_parts)

with open(path_i, "rb") as fle:
    df_raul_irox = pickle.load(fle)

df_raul_irox["source"] = "raul"

In [4]:
# Load the pickled DataFrame stored under parse_my_oer_bulk_dft/out_data.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
oer_rel_parts = (
    "workflow/ml_modelling/processing_bulk_dft",
    "parse_my_oer_bulk_dft/out_data",
    "df_oer_bulk.pickle",
)
path_i = os.path.join(os.environ["PROJ_irox"], *oer_rel_parts)

with open(path_i, "rb") as fle:
    df_oer_bulk = pickle.load(fle)

# NOTE(review): no "source" label is assigned here (the assignment is disabled);
# presumably the pickle already carries a "source" column, since later code
# checks for the value "raul_oer" -- confirm.
# df_raul_irox["source"] = "raul"

In [5]:
from proj_data_irox import oqmd_irox_data_path

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(oqmd_irox_data_path, "rb") as fle:
    df_oqmd_data = pickle.load(fle)

# Discard the "source" column stored in the pickle and re-create it with a
# single uniform label. ("id_unique" is not dropped.)
df_oqmd_data = df_oqmd_data.drop(columns=["source"])
df_oqmd_data["source"] = "oqmd"

# Combining Chris, Raul (bulk and OER bulk) and OQMD data

In [6]:
# Stack the four data sources row-wise into a single DataFrame.
frames = [
    df_raul_irox,
    df_oer_bulk,
    df_chris,
    df_oqmd_data,
    ]

# The frames do not share an identical column set, so pandas has to decide
# whether to sort the combined columns. Older pandas sorted implicitly and
# emitted a FutureWarning; newer versions stop sorting. Passing sort=True
# pins the (alphabetical) column order that this notebook has been producing
# and silences the warning.
df_m = pd.concat(frames, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# Mapping unique ID scheme

In [7]:
# Lookup table with (at least) "stoich", "id" and "unique_ids" columns.
df_id = pd.read_csv(unique_ids_path)

# Build one id -> unique_id dictionary per stoichiometry, filtering the
# table once for each.
rows_ab2 = df_id[df_id["stoich"] == "AB2"]
id_mapp_iro2 = dict(zip(rows_ab2["id"], rows_ab2["unique_ids"]))

rows_ab3 = df_id[df_id["stoich"] == "AB3"]
id_mapp_iro3 = dict(zip(rows_ab3["id"], rows_ab3["unique_ids"]))

# #############################################################################

def method(row_i):
    """Return the unique structure ID for one row of the combined DataFrame.

    Rows whose "source" is "raul_oer" keep their current index label
    (``row_i.name``). Every other row has its "id_old" value translated
    through the module-level lookup dict that matches its "stoich":
    ``id_mapp_iro2`` for "IrO2"/"AB2" and ``id_mapp_iro3`` for "IrO3"/"AB3".

    Parameters
    ----------
    row_i : pandas.Series
        A row providing "source", and for non-"raul_oer" rows also
        "stoich" and "id_old".

    Returns
    -------
    The unique ID for the row.

    Raises
    ------
    ValueError
        If "stoich" is none of the four recognised labels. Previously this
        case only printed a message and then failed with an
        UnboundLocalError on the unassigned mapping dict.
    KeyError
        If "id_old" is not present in the selected mapping.
    """
    if row_i["source"] == "raul_oer":
        return row_i.name

    id_i = row_i["id_old"]
    stoich_i = row_i["stoich"]

    if stoich_i in ("IrO2", "AB2"):
        mapping_dict = id_mapp_iro2
    elif stoich_i in ("IrO3", "AB3"):
        mapping_dict = id_mapp_iro3
    else:
        # Fail loudly and with context instead of falling through to an
        # unbound local variable.
        raise ValueError(
            "Couldn't process id: unrecognised stoich {!r} "
            "(row {!r}, id_old {!r})".format(stoich_i, row_i.name, id_i))

    return mapping_dict[id_i]

# Compute the unique ID of every row, then promote that column to the index.
df_m["id_unique"] = df_m.apply(method, axis="columns")
df_m = df_m.set_index("id_unique")

# Adding energy per atom column

In [8]:
def method(row_i):
    """Return the potential energy of the row's "atoms" object, or None.

    Parameters
    ----------
    row_i : mapping / pandas.Series
        Must provide an "atoms" entry. That entry is either None or an
        object exposing ``get_potential_energy()``.

    Returns
    -------
    The value of ``atoms.get_potential_energy()``, or None when "atoms" is
    None or the call fails for any ordinary reason.
    """
    atoms_i = row_i["atoms"]
    if atoms_i is None:
        return None

    try:
        return atoms_i.get_potential_energy()
    except Exception:
        # Best effort: a missing or failed energy is recorded as None.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long apply() can
        # still be interrupted.
        return None
# Row-wise evaluation of the function defined above; stored as "energy".
df_m["energy"] = df_m.apply(method, axis="columns")

def method(row_i):
    """Return the energy per atom for one row, or None if unavailable.

    Rows whose "source" is "oqmd" and that already carry a non-missing
    "energy_pa" keep that value. For every other row the "energy" value is
    divided by the number of atoms in the "atoms" object
    (``len(atoms.get_atomic_numbers())``).

    Parameters
    ----------
    row_i : mapping / pandas.Series
        Provides "atoms", "source", "energy" and optionally "energy_pa".

    Returns
    -------
    The per-atom energy, or None when the "atoms" entry is missing
    (None or NaN) or the "energy" value is None.
    """
    def _is_missing(value):
        # True for None and for float NaN. NaN is what pandas uses as a
        # placeholder for absent entries after concatenation.
        return value is None or (
            isinstance(value, (float, np.floating)) and np.isnan(value))

    energy_pa = row_i.get("energy_pa", np.nan)
    if not _is_missing(energy_pa) and row_i["source"] == "oqmd":
        return energy_pa

    atoms_i = row_i["atoms"]
    if _is_missing(atoms_i):
        return None

    energy_i = row_i["energy"]
    if energy_i is None:
        # No energy could be extracted for this structure.
        return None

    num_atoms_i = len(atoms_i.get_atomic_numbers())
    return energy_i / num_atoms_i
# Row-wise evaluation of the function defined above; stored as "energy_pa".
df_m["energy_pa"] = df_m.apply(method, axis="columns")

# Adding Formation Enthalpy and Gibbs Free Energy

In [9]:
from proj_data_irox import calc_dH


def method(row_i, calc_dH):
    """Evaluate ``calc_dH`` for one row.

    The row's "energy_pa" is passed positionally and its "stoich" as the
    ``stoich`` keyword; whatever ``calc_dH`` returns is handed back.
    """
    return calc_dH(row_i["energy_pa"], stoich=row_i["stoich"])

# Row-wise dH; calc_dH is forwarded to the function as its second argument.
df_m["dH"] = df_m.apply(method, axis="columns", args=(calc_dH,))

In [10]:
# def method(row_i, argument_0, optional_arg=None):
#     """
#     """
#     return(argument_0)

# arg1 = "TEMP_0"
# df_i = model_i
# df_i["column_name"] = df_i.apply(
#     method,
#     axis=1,
#     args=(arg1, ),
#     optional_arg="TEMP_1"
#     )

# Removing rows with missing atoms objects

In [11]:
# Keep only rows that have an atoms object (neither None nor NaN).
# .copy() makes the result an independent frame, so adding columns to it
# afterwards cannot trigger pandas' SettingWithCopyWarning.
df_m = df_m[df_m["atoms"].notnull()].copy()

# Count number of atoms

In [12]:
def method(row_i):
    """Return the number of atoms in the row's "atoms" object.

    Uses ``len(atoms)`` rather than ``atoms.get_number_of_atoms()``: the
    latter is deprecated in ASE, while ``len()`` is supported across
    versions.

    Parameters
    ----------
    row_i : mapping / pandas.Series
        Must provide a non-null "atoms" entry that supports ``len()``.

    Returns
    -------
    int
        The atom count.
    """
    atoms_i = row_i["atoms"]
    return len(atoms_i)


# Per-row atom count, stored as a new "num_atoms" column.
df_m["num_atoms"] = df_m.apply(method, axis="columns")

# df_i stays bound to the very same frame as df_m.
df_i = df_m

# Save data

In [13]:
# Write the combined DataFrame to out_data/df_bulk_dft.pickle.
directory = "out_data"

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(directory, exist_ok=True)

with open(os.path.join(directory, "df_bulk_dft.pickle"), "wb") as fle:
    pickle.dump(df_m, fle)

In [14]:
# Display the relative path of the pickle file written by the save cell.
os.path.join(directory, "df_bulk_dft.pickle")

'out_data/df_bulk_dft.pickle'

In [15]:
# Spot-check a single entry by its unique-ID index label.
df_m.loc["cubqbpzd7k"]

atoms           (Atom('O', [3.1853, 4.44115, 0.0], index=0), A...
energy_pa                                                -7.04906
form_e_chris                                                  NaN
id                                                            NaN
id_old                                                        473
path            /global/cscratch1/sd/flores12/IrOx_Project_tem...
source                                                       raul
stoich                                                        AB2
energy                                                    -676.71
dH                                                      -0.839922
num_atoms                                                      96
Name: cubqbpzd7k, dtype: object

In [16]:
# 3 * -7.049 - (2 * -4.657947279999998 + -9.304929736367313)

# -7.04

In [26]:
# Show the AB2 rows from the "raul" source, lowest energy per atom first.
is_ab2 = df_m["stoich"] == "AB2"
is_raul = df_m["source"] == "raul"
df_m[is_ab2 & is_raul].sort_values("energy_pa")

Unnamed: 0_level_0,atoms,energy_pa,form_e_chris,id,id_old,path,source,stoich,energy,dH,num_atoms
id_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cubqbpzd7k,"(Atom('O', [3.1853, 4.44115, 0.0], index=0), A...",-7.049062,,,473,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-676.709950,-0.839922,96
6qcdb4bov2,"(Atom('O', [12.46625, 4.81539, 3.98117], index...",-7.048875,,,655,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-676.691954,-0.839734,96
64cg6j9any,"(Atom('O', [4.43912, 4.78444, 3.20932], index=...",-7.047516,,,177,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-169.140375,-0.838375,24
b46enqnq8e,"(Atom('O', [3.2089, 5.18819, 0.0], index=0), A...",-7.047508,,,162,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-169.140203,-0.838368,24
9yz2mt8hbh,"(Atom('Ir', [0.0, 0.0, 0.0], index=0), Atom('I...",-7.047426,,,513,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-84.569115,-0.838286,12
6avov5cy64,"(Atom('Ir', [0.00105, 6.80648, 0.01512], index...",-7.045677,,,175,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-338.192481,-0.836536,48
vunhmsbrml,"(Atom('Ir', [0.0, 3.40259, 1.78835], index=0),...",-7.045582,,,262,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-42.273490,-0.836441,6
momkzj9r84,"(Atom('Ir', [0.124, 0.0, 8.32851], index=0), A...",-7.045556,,,476,/nfs/slac/g/suncatfs/flores12/PROJ_irox_ml_oer...,raul,AB2,-84.546674,-0.836416,12
nazu9q9l9h,"(Atom('O', [8.20322, 0.0, 0.86787], index=0), ...",-7.045528,,,676,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-169.092672,-0.836388,24
m29j648g6i,"(Atom('Ir', [0.0, 0.0, 0.0], index=0), Atom('I...",-7.045449,,,200,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-42.272695,-0.836309,6
