# Parsing Chris's DFT Data on NERSC
---


Author(s): Raul A. Flores

# Notes
---

# Import Modules

In [1]:
import os
import sys

sys.path.insert(0, os.path.join(
    os.environ["PROJ_irox"],
    "chris_prototypes_structures/oqmd_iro3",
    ))

sys.path.insert(0, os.path.join(
    os.environ["PROJ_irox"],
    "data",
    ))

import csv
import pickle

import numpy as np
import pandas as pd

from ase import io
from ase.visualize import view

import plotly.plotly as py
import plotly.graph_objs as go

# #############################################################################
# #############################################################################
from IPython.display import display

# Script Inputs

In [2]:
root_dir = "/mnt/f/GDrive/norskov_research_storage/00_projects/PROJ_irox_2/chris_nersc_files"

# Sandbox
mu_O = -4.523295 + 0.5
mu_Ir = -10.25489552061731 - 0.1

# From Pourbaix Scripts
# mu_O = -4.18119602
# mu_Ir = -8.860644725

# OQMD references
# mu_O = -4.523295
# mu_Ir = -8.855

# Parse ID List from Files from Chris

In [3]:
file_path_i = os.path.join(
    os.environ["PROJ_irox"],
    "data/ml_irox_data/iro2_training_data.csv")
train_data_iro2 = pd.read_csv(file_path_i)
train_data_iro2.set_index("id", inplace=True)

file_path_i = os.path.join(
    os.environ["PROJ_irox"],
    "data/ml_irox_data/iro3_training_data.csv")
train_data_iro3 = pd.read_csv(file_path_i)
train_data_iro3.set_index("id", inplace=True)

train_data_dict = {
    "iro2": train_data_iro2,
    "iro3": train_data_iro3,
    }

# Parse NERSC DFT Data

Comment out to read pickled data file instead (saves time)

In [4]:
# master_data_list = []

# id_list_nersc = []
# for subdir, dirs, files in os.walk(root_dir):
#     if "gas_references" in subdir:
#         continue
#     if "IrO2/Old_ML_calcs" in subdir:
#         continue
#     if "__old__" in subdir:
#         continue
#     if "volume" in subdir:
#         continue
        
#     if "OUTCAR" in files:
#         print(subdir[81:])

#         dir_i = subdir

#         id_i = int(subdir.split("/")[-1].split("_")[0])

#         try:
#             atoms_i = io.read(os.path.join(dir_i, "OUTCAR"))
#             energy_i = atoms_i.get_potential_energy()
#         except:
#             atoms_i = None

#         path_short_i = dir_i.replace("/mnt/f/GDrive/norskov_research_storage/00_projects/PROJ_irox_2/chris_nersc_files/", "")

#         dict_i = {
#             "id": id_i,
#             "atoms": atoms_i,
#             "energy": energy_i,
#             "path": path_short_i,
#             }

#         master_data_list.append(dict_i)

# directory = "out_data"
# if not os.path.exists(directory):
#     os.makedirs(directory)
# # Save Data
# with open("parse_data.pickle", "wb") as fle:
#     pickle.dump(master_data_list, fle)

In [5]:
with open("out_data/parse_data.pickle", "rb") as fle:
    master_data_list = pickle.load(fle)

df = pd.DataFrame(master_data_list)

# Process dataframe

In [6]:
def method(row_i):
    """
    """
    if "IrO2" in row_i["path"]:
        sys_i = "AB2"
    elif "IrO3" in row_i["path"]:
        sys_i = "AB3"
    else:
        sys_i = None
    return(sys_i)

df["stoich"] = df.apply(
    method,
    axis=1,
    )

# #############################################################################
# #############################################################################

def method(row_i):
    """
    """
    ignore_list = [
        "IrO3/winnersIrO3",
        "IrO3/full_relax",
        "IrO3/full_relax_ML1",
        "IrO3/full_relax_ML2",
        "IrO3/full_relax_ML3",
        "IrO3/full_relax_ML4",
        "IrO3/single_point",
        "IrO3/volume_relax",
        "IrO3/volume_relax_ML1",
        "IrO3/volume_relax_ML2",
        "IrO3/volume_relax_ML3",
        "IrO3/volume_relax_ML4",
        ]
    ignore = False
    for ignore_seg_i in ignore_list:
        if ignore_seg_i in row_i["path"]:
            ignore = True

    return(ignore)

# #############################################################################
# #############################################################################

df["ignore_tag"] = df.apply(
    method,
    axis=1,
    )

def method(row_i):
    """
    """
    if "volume" in row_i["path"]:
        out = True
    else:
        out = False
    return(out)

df["volume_tag"] = df.apply(
    method,
    axis=1,
    )

# #############################################################################
# #############################################################################

def method(row_i):
    """
    """    
    atoms_i = row_i["atoms"]
    
    if atoms_i is None:
        return(None)
    else:
        num_atoms_i = len(atoms_i.get_atomic_numbers())
        energy_norm_i = row_i["energy"] / num_atoms_i

        return(energy_norm_i)

df["energy_pa"] = df.apply(
    method,
    axis=1,
    )



def method(row_i):
    """
    """    
    atoms_i = row_i["atoms"]

    if atoms_i is not None:
        forces = atoms_i.get_forces()
        forces_sum = 0.0
        forces_largest = 0.0
        for a in range(len(atoms_i)):
            force_i = np.sqrt(forces[a][0] ** 2 + forces[a][1] ** 2 + forces[a][2] ** 2)
            forces_sum += force_i
            if(force_i > forces_largest):
                forces_largest = force_i

        row_i["force_max"] = forces_largest
        row_i["force_sum"] = forces_sum
    else:
        row_i["force_max"] = None
        row_i["force_sum"] = None

    return(row_i)


df = df.apply(
    method,
    axis=1,
    )

# def method(row_i):
#     """
#     """    
#     energy_per_atom_i = row_i["energy_per_atom"]
#     stoich_i = row_i["stoich"]

#     if stoich_i == "AB2":
#         form_e_i = (3 * energy_per_atom_i) - (1. * mu_Ir) - (2. * mu_O)
#         form_e_i = form_e_i / 3
#     elif stoich_i == "AB3":
#         form_e_i = (4 * energy_per_atom_i) - (1. * mu_Ir) - (3. * mu_O)
#         form_e_i = form_e_i / 4
#     else:
#         form_e_i = None

# #     print(form_e_i)

#     return(form_e_i)

# df["form_e"] = df.apply(
#     method,
#     axis=1,
#     )

# #############################################################################
# #############################################################################

In [7]:
print("Number of calculations for IrO3 parsed:", len(df[df["stoich"] == "IrO3"]))

df = df[df["ignore_tag"] == False]
df = df[df["volume_tag"] == False]

# df = df[df["force_max"] < 0.01]
# df = df[df["force_max"] < 0.05]

Number of calculations for IrO3 parsed: 0


# IrO2

In [8]:
# df_iro2 = df[df["stoich"] == "IrO2"]
df_iro2 = df[df["stoich"] == "AB2"]

master_data = []
for id_i, row_i in train_data_dict["iro2"].iterrows():

    if row_i["source"] != "chris":
        continue

    form_e_chris_i = row_i["form_e_chris"]

    df_i = df_iro2[df_iro2["id"] == id_i]
#     display(df_i)

    if len(df_i) == 0:
        print(id_i, " | There are no rows for this id!!!!")

    df_0 = df_i[df_i["path"].str.contains("final_opt_new1-3")]
    df_1 = df_i[df_i["path"].str.contains("final_relax")]

    row_i = None
    if len(df_0) > 0:
        if len(df_0) > 1:
            print("NOOOOOOOOOO!!!!!!!!!")
        row_j = df_0.iloc[0]

    else:
        if len(df_1) > 0:
            if len(df_1) > 1:
                print("NOOOOOOOOOO!!!!!!!!!")
            row_j = df_1.iloc[0]
        else:
            tmp = 42
    
    row_j["form_e_chris"] = form_e_chris_i
    master_data.append(row_j)

df_iro2_unique = pd.concat(master_data, axis=1, sort=True).transpose()
df_iro2_unique.set_index("id", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



# IrO3

In [9]:
# df_iro3 = df[df["stoich"] == "IrO3"]
df_iro3 = df[df["stoich"] == "AB3"]

master_data = []
for id_i, row_i in train_data_dict["iro3"].iterrows():

    if row_i["source"] != "chris":
        continue

    form_e_chris_i = row_i["form_e_chris"]

    df_i = df_iro3[df_iro3["id"] == id_i]

    if len(df_i) == 0:
        print(id_i, " | There are no rows for this id!!!!")
        row_j = pd.Series({"id": int(id_i)})
    else:
        df_0 = df_i[df_i["path"].str.contains("final_opt_new1-3_sorted")]
        if len(df_0) > 0:
            if len(df_0) > 1:
                print("NOOOOOOOOOO!!!!!!!!!")
            row_j = df_0.iloc[0]
        else:
            row_j = df_i.iloc[0]

    row_j["form_e_chris"] = form_e_chris_i
    master_data.append(row_j)

df_iro3_unique = pd.concat(master_data, axis=1, sort=True).transpose()
df_iro3_unique = df_iro3_unique.astype({"id": int})
df_iro3_unique.set_index("id", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



# Applying unique IDs

In [10]:
df_iro2_dft = df_iro2_unique
df_iro3_dft = df_iro3_unique

In [11]:
path_i = os.path.join(
    os.environ["PROJ_irox"],
    "data/ml_irox_data",
    "unique_ids.csv")

df_id = pd.read_csv(path_i)

id_mapp_iro2 = dict(zip(
    df_id[df_id["stoich"] == "AB2"]["id"],
    df_id[df_id["stoich"] == "AB2"]["unique_ids"],
    ))

id_mapp_iro3 = dict(zip(
    df_id[df_id["stoich"] == "AB3"]["id"],
    df_id[df_id["stoich"] == "AB3"]["unique_ids"],
    ))

In [12]:
def method(row_i):
    """
    """
    id_i = row_i.name
    if row_i["stoich"] == "IrO2" or row_i["stoich"] == "AB2":
        mapping_dict = id_mapp_iro2
    elif row_i["stoich"] == "IrO3" or row_i["stoich"] == "AB3":
        mapping_dict = id_mapp_iro3
    else:
        print(row_i)

    id_unique_i = mapping_dict[id_i]

    return(id_unique_i)

df = df_iro2_dft
df["id_unique"] = df.apply(
    method,
    axis=1,
    )
df["id_old"] =  df.index.values
df.set_index("id_unique", inplace=True)


df = df_iro3_dft
df["id_unique"] = df.apply(
    method,
    axis=1,
    )
df["id_old"] =  df.index.values
df.set_index("id_unique", inplace=True)

In [13]:
df_dft_calcs = pd.concat([
    df_iro2_dft,
    df_iro3_dft,
    ])

# Strip Unneeded Columns

In [14]:
good_bye_list = [
    "ignore_tag",
    "volume_tag",
    ]
df_dft_calcs.drop(good_bye_list, axis=1, inplace=True)

In [15]:
df_dft_calcs["source"] = "chris"

# Save data to pickle

In [16]:
directory = "out_data"
if not os.path.exists(directory):
    os.makedirs(directory)
with open("out_data/df_dft_calcs.pickle", "wb") as fle:
    pickle.dump(df_dft_calcs, fle)

In [25]:
df_dft_calcs

Unnamed: 0_level_0,atoms,energy,energy_pa,force_max,force_sum,form_e_chris,path,stoich,id_old,source
id_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
z39g648rnl,"(Atom('O', [1.09528, 7.66698, 7.66698], index=...",-150.762,-6.28174,0,0,-0.085637,IrO2/final_opt_new1-3/25_fixed.cif,AB2,25,chris
82mwbwbgbl,"(Atom('O', [7.23772, 7.23732, -0.0], index=0),...",-157.948,-6.58117,0.00683552,0.124672,-0.385069,IrO2/final_opt_new1-3/59_fixed.cif,AB2,59,chris
8495vjvsc3,"(Atom('O', [-0.51619, 6.36938, 1.47681], index...",-59.1215,-6.56905,0.00917233,0.0413081,-0.372952,IrO2/final_opt_new1-3/69_fixed.cif,AB2,69,chris
xonhb1cgnq,"(Atom('Ir', [-2e-05, -3e-05, 2.46526], index=0...",-83.4011,-6.95009,0.00673923,0.0510938,-0.753994,IrO2/final_opt_new1-3/72_fixed.cif,AB2,72,chris
8aza84xlvs,"(Atom('Ir', [0.0, 0.0, -0.0], index=0), Atom('...",-122.502,-6.80567,0.00931162,0.106901,-0.609567,IrO2/final_opt_new1-3/73_fixed.cif,AB2,73,chris
vwxrnun48g,"(Atom('O', [12.87719, 3.08577, 6.17657], index...",-711.04,-6.5837,0.0174474,0.880346,-0.387602,IrO2/final_opt_new1-3/80_fixed.cif,AB2,80,chris
9wzqvtmw6a,"(Atom('O', [0.31715, 5.16398, 10.26108], index...",-161.113,-6.71302,0.00988006,0.128339,-0.516924,IrO2/final_opt_new1-3/83_fixed.cif,AB2,83,chris
c48lx363be,"(Atom('Ir', [0.0, 0.0, 6e-05], index=0), Atom(...",-81.2711,-6.77259,0.006717,0.039109,-0.576493,IrO2/final_opt_new1-3/107_fixed.cif,AB2,107,chris
6fzy7kcrxy,"(Atom('O', [4.28427, 4.28427, 4.28427], index=...",-68.0632,-5.67194,0.00128172,0.0102537,0.524164,IrO2/final_opt_new1-3/152_fixed.cif,AB2,152,chris
ckbg7d9u6u,"(Atom('O', [4.36138, 1.53522, 3.26091], index=...",-161.165,-6.71521,0.00705566,0.0808736,-0.519115,IrO2/final_opt_new1-3/167_fixed.cif,AB2,167,chris


In [18]:
# oqmd_ids = [
#     10054,
#     694981,
#     690788,
#     825007,
#     694982,
#     825035,
#     825034,
#     325288,
#     298609,
#     309147,
#     349774,
#     ]

# for oqmd_id in oqmd_ids:

#     unique_ids_possible = [
#         id_mapp_iro2.get(oqmd_id, None),
#         id_mapp_iro3.get(oqmd_id, None),
#         ]

#     reduced_id_list = [v for v in unique_ids_possible if v is not None]
    
#     if len(reduced_id_list) != 1:
#         print("ERROR | !!!!! | IDJFISDJF")
#     else:
#         print(reduced_id_list[0])

In [19]:
# df_combined_iro2 = pd.concat([df_iro2_unique, train_data_dict["iro2"]], axis=1)


# data = []

# y_array = df_combined_iro2["form_e"]
# trace = go.Scatter(
#     y=y_array,
#     mode="markers",
#     name="Mine",
#     )
# data.append(trace)

# y_array = df_combined_iro2["form_e_chris"]
# trace = go.Scatter(
#     y=y_array,
#     mode="markers",
#     name="Chris",
#     )
# data.append(trace)

# py.iplot(data,
#     filename=os.path.join(
#         "__temp__",
#         "temp_plot_0"
#         )
#     )

In [20]:
# print(
#     "Mean  error: ",
#     (df_combined_iro2["form_e"] - df_combined_iro2["form_e_chris"]).dropna().mean()
#     )

In [21]:
# df_combined_iro3 = pd.concat(
#     [df_iro3_unique, train_data_dict["iro3"]],
#     axis=1)

# data = []

# y_array = df_combined_iro3["form_e"]
# trace = go.Scatter(
#     y=y_array,
#     mode="markers",
#     name="Mine",
#     )
# data.append(trace)

# y_array = df_combined_iro3["form_e_chris"]
# trace = go.Scatter(
#     y=y_array,
#     mode="markers",
#     name="Chris",
#     )
# data.append(trace)

# py.iplot(data,
#     filename=os.path.join(
#         "__temp__",
#         "temp_plot_1"
#         )
#     )

In [22]:
# print(
#     "Mean  error: ",
#     (df_combined_iro3["form_e"] - df_combined_iro3["form_e_chris"]).dropna().mean()
#     )