# Analysing Similarity Matrix for IrOx Systems Post-DFT
---

Systems that have the same similarity distance (d) but different energies:

list_0 = [
 '8481z1n1na',
 'zr9ic2zaz5',
 '8h9snabqca',
 '7f8pm5mhnu',
 'cgx3mkzhmd',
 'vwxfn3blxi',
 '9obw8dbrvy',
 'bpvynr7p9w',
 '8gnovr727t',


 '9pb4c1927h',
 '8i63m2b5ve',


 'vlxp9abd6h',
 'z2nh817ene',
 'xu6ivyvfvf',
 ]

# Import Modules

In [1]:
import os
import sys

import pickle
import pandas as pd

# #############################################################################
import plotly.graph_objs as go

# #############################################################################
sys.path.insert(0, os.path.join(os.environ["PROJ_irox"], "data"))
from proj_data_irox import (
    static_irox_structures_path,
    bulk_dft_data_path,
    df_dij_path)

from methods import plot_dij_matrix_heatmap
from plotting.my_plotly import my_plotly_plot

# Script Inputs

In [2]:
# Distance threshold applied to the D_ij matrix: entries below this value are
# what the duplicate-analysis cells of this notebook select on.
# The commented-out assignments are alternative values that can be swapped in.
# d_thresh = 0.075

# d_thresh = 0.01
d_thresh = 0.02
# d_thresh = 0.03
# d_thresh = 0.04
# d_thresh = 0.05
# d_thresh = 0.06
# d_thresh = 0.07
# d_thresh = 0.08
# d_thresh = 0.09
# d_thresh = 0.10
# d_thresh = 0.20
# d_thresh = 0.30
# d_thresh = 0.40
# d_thresh = 0.70


# Energy threshold; in the active code of this notebook it is only forwarded
# to `plot_dij_matrix_heatmap` (presumably in the units of `energy_pa` —
# TODO confirm against that helper).
e_thresh = 0.01

# When True, the D_ij heatmap cell builds the figure and writes it out.
create_plot = True

# Read Data

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as fle:
        return pickle.load(fle)


# D_ij matrix
# Alternative input kept for reference:
#   df_dij_path[0:-18] + "df_d_ij_all_temp.pickle"
df_dij_dft = _read_pickle(df_dij_path)
print("df_dij_dft.shape:", df_dij_dft.shape)

df_static_irox = _read_pickle(static_irox_structures_path)
df_bulk_dft = _read_pickle(bulk_dft_data_path)

# Ids stored by a previous elimination pass, keyed by stoichiometry
path_i = os.path.join(
    os.environ["PROJ_irox"],
    "workflow/ml_modelling",
    "ccf_similarity_analysis/out_data",
    "all_ids_to_elim_1.pickle")
ids_to_drop_prev = _read_pickle(path_i)

# Merge both stoichiometries into one flat list
ids_to_drop_prev = ids_to_drop_prev["AB2"] + ids_to_drop_prev["AB3"]

# sys.path.insert(0, "../04_final_ml_plots")

df_dij_dft.shape: (1711, 1711)


In [4]:
# # df_dij_dft.loc["8p8evt9pcg", "9lmkmh8s8r"]


# df_dij_dft.loc[

#     "64cg6j9any",
#     "b46enqnq8e",
#     "9yz2mt8hbh",

# #     "6avov5cy64"
    
# #     "clc2b1mavs",
#     ]

# Dropping Static Structure from D_ij

In [5]:
# Static-structure ids that are present in the D_ij matrix
dij_labels = df_dij_dft.index
static_ids = df_static_irox["static_id"].tolist()
static_ids_in_dij = [id_i for id_i in static_ids if id_i in dij_labels]

# Remove them from the rows and from the columns
df_dij_dft = df_dij_dft.drop(
    index=static_ids_in_dij,
    columns=static_ids_in_dij)

# Filtering data to needed systems

In [6]:
# Keep only the rows whose `source` is neither "chris" nor "oqmd".
# A single `isin` mask replaces the chained comparisons and the former
# all-True placeholder list, which did not filter anything.
excluded_sources = ["chris", "oqmd"]
df_bulk_dft = df_bulk_dft[~df_bulk_dft["source"].isin(excluded_sources)]

# Row count vs. number of distinct index labels; equal values mean the index
# is unique, which the label-based lookups in later cells rely on.
print("df_bulk_dft.shape:", "\n", df_bulk_dft.shape)
print("df_bulk_dft.index.unique().shape:", "\n",
    df_bulk_dft.index.unique().shape)

df_bulk_dft.shape: 
 (740, 11)
df_bulk_dft.index.unique().shape: 
 (740,)


# Reorder index by stoichiometry first and then by energy

In [7]:
def _ids_by_energy(stoich):
    """Ids of the `df_bulk_dft` rows with stoichiometry `stoich`, ordered by ascending `energy_pa`."""
    rows = df_bulk_dft[df_bulk_dft["stoich"] == stoich]
    return rows.sort_values("energy_pa").index.tolist()


ab2_indices = _ids_by_energy("AB2")
ab3_indices = _ids_by_energy("AB3")

dij_ids = df_dij_dft.index
ab2_indices_not_in_dij = [id_i for id_i in ab2_indices if id_i not in dij_ids]

# AB2 block first, then AB3; only ids that exist in the D_ij matrix
new_ind_order = ab2_indices + ab3_indices
new_index_order_filtered = [id_i for id_i in new_ind_order if id_i in dij_ids]

# Apply the same ordering to rows and columns
df_dij_dft = df_dij_dft.reindex(
    new_index_order_filtered)[new_index_order_filtered]

In [8]:
# Sizes of the two stoichiometry groups and of the reordered matrix
for label_i, ids_i in (
    ("ab2_indices", ab2_indices),
    ("ab3_indices", ab3_indices),
    ):
    print("len(" + label_i + "):", len(ids_i))
print("")
print("df_dij_dft.shape:", df_dij_dft.shape)

len(ab2_indices): 488
len(ab3_indices): 252

df_dij_dft.shape: (740, 740)


# Reorder index to put OER bulk systems first

In [9]:
oer_sys_ids = ['IrO3_rutile-like', 'IrO3', 'IrO3_battery', 'IrO2']

# Index.drop raises KeyError when one of the OER ids is missing from the matrix
non_oer_ids = df_dij_dft.index.drop(oer_sys_ids)
new_index_order = [*oer_sys_ids, *non_oer_ids.tolist()]

# Same ordering for rows and columns
df_dij_dft = df_dij_dft.reindex(new_index_order)[new_index_order]

# Drop ids that were identified to be redundant

In [10]:
# df_dij_dft = df_dij_dft.drop(labels=ids_to_drop_prev, axis=0)
# df_dij_dft = df_dij_dft.drop(labels=ids_to_drop_prev, axis=1)

In [11]:
# Entries of the "IrO3_rutile-like" row with a value below 0.01
row_rutile_like = df_dij_dft.loc["IrO3_rutile-like"]
row_rutile_like[row_rutile_like < 0.01]

index
IrO3_rutile-like    0.000000
b5cgvsb16w          0.000157
Name: IrO3_rutile-like, dtype: float64

# Create D_ij Matrix Plot

In [12]:
if create_plot:
    # Plot data for the reordered D_ij matrix (built by the project helper)
    data = plot_dij_matrix_heatmap(df_dij_dft, d_thresh, e_thresh)

    layout = go.Layout(width=1100, height=1100)
    fig = go.Figure(data=data, layout=layout)

    # Output formats requested from the project's plotly wrapper
    write_opts = dict(
        write_html=True,
        write_png=True,
        write_pdf=False,
        write_svg=False,
        )
    fig = my_plotly_plot(
        figure=fig,
        plot_name='irox_dij_heatmap',
        **write_opts)

Writing pdf with ORCA


In [13]:
# fig

# Analyzing systems that are duplicates

In [14]:
# Square sub-matrices of D_ij, one per stoichiometry, in the order of the id lists
df_dij_ab2, df_dij_ab3 = (
    df_dij_dft.loc[ids_i, ids_i]
    for ids_i in (ab2_indices, ab3_indices))

In [15]:
def ids_to_elim(df_dij, dist_thresh=None):
    """Collect the ids of structures that duplicate another structure.

    For every row of `df_dij`, the columns whose value lies below the
    threshold form a group of structures similar to that row's structure.
    The first id of the group, in the column order of `df_dij`, is kept and
    every other id of the group is marked for elimination.

    Args:
        df_dij: pandas DataFrame of pairwise distances with structure ids as
            row and column labels. The column order decides which member of
            a group is kept.
        dist_thresh: distance below which two structures are treated as
            duplicates. When omitted, the notebook-level `d_thresh` is used.

    Returns:
        List of ids to eliminate, without repeats, in order of first
        occurrence (reproducible between runs).
    """
    if dist_thresh is None:
        dist_thresh = d_thresh

    index_to_eliminate = []
    for _, row_i in df_dij.iterrows():
        ids_below_thresh = row_i[row_i < dist_thresh].index
        # Everything after the first id of the group is redundant; the slice
        # is empty when the group has a single member.
        index_to_eliminate.extend(ids_below_thresh[1:].tolist())

    # dict.fromkeys removes repeats while preserving insertion order
    return list(dict.fromkeys(index_to_eliminate))

In [16]:
# Duplicate ids, determined separately for each stoichiometry
ids_to_elim_ab2 = ids_to_elim(df_dij_ab2)
ids_to_elim_ab3 = ids_to_elim(df_dij_ab3)

all_ids_to_elim = dict(AB2=ids_to_elim_ab2, AB3=ids_to_elim_ab3)

for var_name_i, ids_i in (
    ("ids_to_elim_ab2", ids_to_elim_ab2),
    ("ids_to_elim_ab3", ids_to_elim_ab3),
    ):
    print("len(" + var_name_i + "):", len(ids_i))

len(ids_to_elim_ab2): 92
len(ids_to_elim_ab3): 63


## Saving ids of systems that are duplicates

In [17]:
# Pickling data ######################################################
# `os` and `pickle` come from the import cell at the top of the notebook.
directory = "out_data"
# exist_ok=True creates the folder if needed and is a no-op otherwise,
# without a separate existence check.
os.makedirs(directory, exist_ok=True)

# The same dictionary is written under both file names.
# TODO | Don't create "all_ids_to_elim_1.pickle" anymore
for file_name_i in ("all_ids_to_elim_1.pickle", "all_ids_to_elim.pickle"):
    with open(os.path.join(directory, file_name_i), "wb") as fle:
        pickle.dump(all_ids_to_elim, fle)
# #####################################################################

In [21]:
# AB2 entries, ordered by ascending dH
df_bulk_dft.loc[df_bulk_dft["stoich"] == "AB2"].sort_values("dH")

Unnamed: 0_level_0,atoms,energy_pa,form_e_chris,id,id_old,path,source,stoich,energy,dH,num_atoms
id_unique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cubqbpzd7k,"(Atom('O', [3.1853, 4.44115, 0.0], index=0), A...",-7.049062,,,473,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-676.709950,-0.839922,96
6qcdb4bov2,"(Atom('O', [12.46625, 4.81539, 3.98117], index...",-7.048875,,,655,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-676.691954,-0.839734,96
64cg6j9any,"(Atom('O', [4.43912, 4.78444, 3.20932], index=...",-7.047516,,,177,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-169.140375,-0.838375,24
b46enqnq8e,"(Atom('O', [3.2089, 5.18819, 0.0], index=0), A...",-7.047508,,,162,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-169.140203,-0.838368,24
9yz2mt8hbh,"(Atom('Ir', [0.0, 0.0, 0.0], index=0), Atom('I...",-7.047426,,,513,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-84.569115,-0.838286,12
...,...,...,...,...,...,...,...,...,...,...,...
brbizonjmy,"(Atom('O', [0.23164, 2.83949, 1.11906], index=...",-5.477754,,,132,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-16.433262,0.731386,3
6l7fnyctmt,"(Atom('Ir', [1.123, 5.61647, 1.123], index=0),...",-5.427689,,,161,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-130.264524,0.781452,24
7umymtza7a,"(Atom('O', [2.95957, 0.0, 1.31196], index=0), ...",-5.416683,,,185,/scratch/users/flores12/PROJ_irox_ml_oer/ml_bu...,raul,AB2,-32.500095,0.792458,6
7am4vwzi7p,"(Atom('O', [1.26709, 1.26709, 3.80126], index=...",-4.978113,,,385,/global/cscratch1/sd/flores12/IrOx_Project_tem...,raul,AB2,-59.737356,1.231027,12


In [22]:
# Single entry of the AB3 sub-matrix
# (pair inspected earlier: "8p8evt9pcg", "zimixdvdxd")
df_dij_ab3.at["xw9y6rbkxr", "zimixdvdxd"]

0.044773534305960316

In [19]:
# NOTE(review): this always raises AssertionError — presumably a deliberate
# stop so that "Run All" halts before the scratch cells that follow; confirm
# before removing.
assert False

AssertionError: 

# TEST | TEST | TEST

In [None]:
# Scratch check: is this id one of the row labels of the D_ij matrix?
id_to_check = "7h7yns937p"
df_dij_dft.shape

id_to_check in df_dij_dft.index

In [None]:

ids_to_elim_ab3 = all_ids_to_elim["AB3"]

print(len(ab3_indices))

# AB3 ids that survive the elimination; a set keeps the membership test cheap
ids_to_elim_ab3_set = set(ids_to_elim_ab3)
unique_ids_ab3 = [i for i in ab3_indices if i not in ids_to_elim_ab3_set]

data_dict_list = []
for id_i in unique_ids_ab3:
    # Ids without a row in the D_ij matrix are skipped
    if id_i not in df_dij_dft.index:
        continue

    row_i = df_dij_dft.loc[id_i]
    # Number of entries below the threshold, minus one; the subtraction
    # presumably discounts the structure's own self-distance (assumes it is
    # below d_thresh — TODO confirm).
    num_duplicates = int((row_i < d_thresh).sum()) - 1
    data_dict_list.append({
        "id_unique": id_i,
        "num_duplicates": num_duplicates,
        })

# Explicit columns keep the sort below valid even when nothing was collected
df_tmp = pd.DataFrame(
    data_dict_list,
    columns=["id_unique", "num_duplicates"])

df_tmp.sort_values("num_duplicates", ascending=False)

In [None]:
# TEMP
# df_dij_dft = df_dij_dft.loc[all_ids_to_elim, all_ids_to_elim]

ids_dict_master = {}
for name_i, row_i in df_dij_dft.iterrows():
    # Entries below d_thresh in this row, with the row's own id removed
    neighbours_i = row_i[row_i < d_thresh].drop(name_i)
    df_i = df_bulk_dft.loc[neighbours_i.index]
    if len(df_i) == 0:
        continue

    equiv_ids_list = df_i.index.sort_values().tolist()
    ids_dict_master[name_i] = dict(
        id_joined_str="_".join(equiv_ids_list),
        equiv_ids_list=equiv_ids_list,
        )

# #############################################################################
# One row per id that has at least one other id below d_thresh
df_test = pd.DataFrame(ids_dict_master).T

df_test

In [None]:
# Single entry of the full D_ij matrix
df_dij_dft.at["8p8evt9pcg", "zimixdvdxd"]

In [None]:
# ids_dict_master
# tmp_list = []
# for key_i, val_i in ids_dict_master.items():
#     for key_j, val_j in ids_dict_master.items():

#         if key_i == key_j:
#             continue

#         print(key_i, key_j)

#         if val_j == val_i:
#         else:
#             tmp_list.append(key_i)

# ids_dict_master

# if val_j == val_i:

# len(all_ids_to_elim)


# print(len([i for i in all_ids_to_elim if i in ab2_indices]))
# print(len([i for i in all_ids_to_elim if i in ab3_indices]))

# # np.fill_diagonal(df_dij_dft.values, np.nan)
# # e_thresh = 0.01
# use_energy_simil = False

# trouble_ids_list = []

# unique_id_list = []
# all_ids_to_elim = []
# for i_cnt, (name_i, row_i) in enumerate(df_dij_dft.iterrows()):
#     tmp = row_i[row_i < d_thresh]

#     # if len(tmp) > 1:
#     #     break

#     if tmp.shape[0] == 1:
#         mess = "No other structures close to this one"
#         # print(mess)
#         unique_id_list.append(tmp.index[0])
#     else:
#         df_i = df_bulk_dft.loc[tmp.index]

#         # if "8k7expx2bp" in df_i.index.tolist():
#         # if "6s648e8s6p" in df_i.index.tolist():
#         # if "b5cgvsb16w" in df_i.index.tolist():
#         #     display(df_i)

#         e_range = abs(df_i["energy_pa"].min() - df_i["energy_pa"].max())
#         e_thresh_u = df_i.loc[name_i]["energy_pa"] + e_thresh
#         e_thresh_l = df_i.loc[name_i]["energy_pa"] - e_thresh

#         # Using enery similarity criteria
#         if use_energy_simil:
#             df_j = df_i[
#                 (df_i["energy_pa"] < e_thresh_u) &
#                 (df_i["energy_pa"] > e_thresh_l)]
#             index_to_keep_i = df_j.sort_values("energy_pa").iloc[0].name
#             index_to_eliminate = df_j.iloc[1:].index.tolist()
#         else:
#             index_to_keep_i = df_i.sort_values("energy_pa").iloc[0].name
#             index_to_eliminate = df_i.iloc[1:].index.tolist()


#         all_ids_to_elim += index_to_eliminate

#         if e_range > e_thresh:
#             # display(df_i)
#             df_i_tmp = df_i

#             ids_tmp = df_i_tmp.index.tolist()
#             trouble_ids_list += ids_tmp

#             # print("Energies span greater range than 'e_thresh'")
#             # print(e_range)
#             # print("")

# all_ids_to_elim = list(set(all_ids_to_elim))

# trouble_ids_list = list(set(trouble_ids_list))

# #############################################################################
# Drop ab2 stoicheomtry

# df_dij_dft = df_dij_dft.drop(labels=ab2_indices, axis=0)
# df_dij_dft = df_dij_dft.drop(labels=ab2_indices, axis=1)

# dft_indices = ab2_indices + ab3_indices
# non_dft_indices = [i for i in df_dij_dft.index if i not in dft_indices]