# Import Modules

In [1]:
import os
print(os.getcwd())

import sys

import pickle

import pandas as pd

import bulk_enumerator as be
import time

from pymatgen.io.vasp.inputs import Poscar
from pymatgen.io.ase import AseAtomsAdaptor

# Make the project's `data` directory importable. Its parent directory is read
# from the PROJ_irox environment variable (a KeyError is raised if it is unset).
proj_data_dir = os.path.join(os.environ["PROJ_irox"], "data")
sys.path.insert(0, proj_data_dir)

from proj_data_irox import (
    bulk_dft_data_path,
    unique_ids_path,
    prototypes_data_path,
    static_irox_structures_path,
    oqmd_irox_data_path,
    )

/home/raulf2012/Dropbox/01_norskov/00_git_repos/PROJ_IrOx_Active_Learning_OER/workflow/ml_modelling/processing_bulk_dft/prototype_classification


# Read Data

In [2]:
# Load the pickled bulk DFT dataframe from the path provided by proj_data_irox
with open(bulk_dft_data_path, mode="rb") as pickle_file:
    df_bulk_dft = pickle.load(pickle_file)

# Keep only the rows whose `source` column equals "raul"
is_raul_source = df_bulk_dft.source == "raul"
df_bulk_dft = df_bulk_dft[is_raul_source]

# Classify prototype info

In [3]:
# Tolerance handed to bulk_enumerator's BULK object for every structure.
# Alternative values previously listed here (kept for reference when re-tuning):
#   1e-12, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-3, 1e-2, 1e-1
tolerance = 0.0001


In [4]:
# Classify every structure in `df_bulk_dft` with bulk_enumerator and collect
# the results into `df_proto` (one row per structure, indexed by structure id).

# Column layout of the resulting table. Declaring it up front means an empty
# input still produces a well-formed (empty) frame instead of a KeyError on "id".
proto_columns = [
    "id",
    "spacegroup_i",
    "species_i",
    "wyckoff_i",
    "name_i",
    "parameter_values_i",
    "primitive_natoms",
    "std_natom",
    ]

# perf_counter is a monotonic, high-resolution clock meant for interval timing
t0 = time.perf_counter()

data_list = []
# Only the "atoms" column is needed, so iterate over (index, atoms) pairs
# rather than materialising a full row Series for every entry.
for id_i, atoms_i in df_bulk_dft["atoms"].items():

    # ASE Atoms -> pymatgen Structure -> POSCAR-formatted text
    structure_i = AseAtomsAdaptor.get_structure(atoms_i)
    poscar_str_i = Poscar(structure_i).get_string()

    # A fresh BULK object is built per structure. The name `b` is kept so the
    # last one stays available for ad-hoc inspection after the loop.
    b = be.bulk.BULK(
        tolerance=tolerance,
        )
    # NOTE: despite the method name, the POSCAR *text* is passed here, not a path
    b.set_structure_from_file(poscar_str_i)

    row_dict_i = {
        "id": id_i,
        "spacegroup_i": b.get_spacegroup(),
        "species_i": b.get_species(),
        "wyckoff_i": b.get_wyckoff(),
        "name_i": b.get_name(),
        "parameter_values_i": b.get_parameter_values(),
        "primitive_natoms": b.get_primitive_natom(),
        "std_natom": b.get_std_natom(),
        }
    data_list.append(row_dict_i)


t1 = time.perf_counter()
elapsed_sec = t1 - t0
num_processed = len(data_list)

print("time to complete for loop: ", elapsed_sec, "sec")
# Skip the per-iteration figure when nothing was processed (avoids dividing by zero)
if num_processed > 0:
    print("time to complete for loop (per iter): ", elapsed_sec / num_processed, "sec")
print("")

df_proto = pd.DataFrame(data_list, columns=proto_columns).set_index("id")

print(
    "Number of entries processed: ",
    len(df_proto)
    )

# Several structures can share one prototype name, so this count may be
# smaller than the number of entries processed.
print(
    "Unique entries (some systems with the same prototype): ",
    len(set(df_proto["name_i"])),
    )


time to complete for loop:  73.31834673881531 sec
time to complete for loop (per iter):  0.09961731893860776 sec

Number of entries processed:  736
Unique entries (some systems with the same prototype):  540


In [5]:
# Sum over all entries of (std_natom - primitive_natoms)
natom_difference = df_proto["std_natom"] - df_proto["primitive_natoms"]
num_atoms_removed = natom_difference.sum()

print("num_atoms_removed:", num_atoms_removed)

num_atoms_removed: 5323


In [6]:
# Show the prototype table as this cell's output (pandas elides the middle rows of long frames)
df_proto

Unnamed: 0_level_0,spacegroup_i,species_i,wyckoff_i,name_i,parameter_values_i,primitive_natoms,std_natom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
95c29e9f6h,4,"[Ir, Ir, O, O, O, O]","[a, a, a, a, a, a]",AB2_4_a2_a4_4,"[{'name': 'a', 'value': 3.1577903249242185}, {...",12,12
zh6rc56hzs,1,"[Ir, Ir, Ir, Ir, O, O, O, O, O, O, O, O]","[a, a, a, a, a, a, a, a, a, a, a, a]",AB2_4_a4_a8_1,"[{'name': 'a', 'value': 4.708590000066368}, {'...",12,12
ml6snhmqxe,221,"[Ir, O, O, O]","[k, h, i, m]",AB2_24_k_him_221,"[{'name': 'a', 'value': 13.5968}, {'name': 'yk...",72,72
zy9dzknhnj,74,"[Ir, O, O]","[e, a, c]",AB2_2_e_ac_74,"[{'name': 'a', 'value': 7.28936}, {'name': 'b/...",6,12
ca7u6o9ins,211,"[Ir, O, O]","[i, h, i]",AB2_12_i_hi_211,"[{'name': 'a', 'value': 11.5636}, {'name': 'yi...",36,72
...,...,...,...,...,...,...,...
xuvkcucdzf,6,"[Ir, Ir, Ir, Ir, O, O, O, O, O, O, O, O, O, O,...","[a, a, b, b, a, a, a, a, a, a, b, b, b, b, b, b]",AB3_4_a2b2_a6b6_6,"[{'name': 'a', 'value': 7.846689940905337}, {'...",16,16
zez2zp9o7o,139,"[Ir, O, O, O]","[e, c, d, e]",AB3_2_e_cde_139,"[{'name': 'a', 'value': 4.10895}, {'name': 'b/...",8,16
miceml8yc1,191,"[Ir, O, O, O]","[f, b, c, k]",AB3_3_f_bck_191,"[{'name': 'a', 'value': 7.167609850862035}, {'...",12,12
me8d9sx47e,1,"[Ir, Ir, Ir, Ir, Ir, Ir, Ir, Ir, Ir, Ir, Ir, I...","[a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, ...",AB3_18_a18_a54_1,"[{'name': 'a', 'value': 8.23602}, {'name': 'b/...",72,72


In [7]:
# Scratch reference: commented-out calls on `b`, the BULK object left over from
# the last iteration of the classification loop. Nothing in this cell executes.
# b.get_name()
# b.get_parameter_gradients()
# b.get_parameter_values()
# b.get_parameters()
# b.get_primitive_natom()
# b.get_primitive_poscar()
# b.get_spacegroup()
# b.get_species()
# b.get_species_permutations()
# b.get_std_natom()
# b.get_std_poscar()
# b.get_wyckoff()
# b.get_wyckoff_list()
# b.get_wyckoff_structure_map()

In [8]:
# Scratch cell: commented-out inspection calls on `b`; nothing here executes.
# b.get_primitive_natom()
# b.get_std_natom()
# b.get_std_poscar()
# b.get_wyckoff()