# New ML Active Learning Workflow
---

# Import Modules

In [1]:
import os
import sys
import copy

import pickle

import pandas as pd

from sklearn.decomposition import PCA

from protosearch.ml_modelling.fingerprint import (
    FingerPrint,
    VoronoiFingerprint
    )

# #############################################################################
pd.set_option('display.max_rows', None)
os.getcwd()



# #############################################################################
sys.path.insert(0, os.path.join(os.environ["PROJ_irox"], "data"))
# from proj_data_irox import (
# #     df_features_path,
# #     df_features_cleaned_path,
# #     df_features_cleaned_pca_path,
#     )

from proj_data_irox import (
    bulk_dft_data_path,
    unique_ids_path,
    prototypes_data_path,
    static_irox_structures_path,
    static_irox_structures_kirsten_path,
#     oqmd_irox_data_path,
    
    df_features_pre_opt_path,
    df_features_pre_opt_kirsten_path,
    df_features_post_opt_path,
    )

In [2]:
directory = "out_data"
if not os.path.exists(directory):
    os.makedirs(directory)

# Read Data

In [3]:
with open(bulk_dft_data_path, "rb") as fle:
    df_bulk_dft = pickle.load(fle)

with open(static_irox_structures_path, "rb") as fle:
    df_struct = pickle.load(fle)


with open(static_irox_structures_kirsten_path, "rb") as fle:
    df_struct_kirsten = pickle.load(fle)

df_ids = pd.read_csv(unique_ids_path)

# Remove rows with missing atoms objects

In [4]:
# Removing missing data
df_bulk_dft = df_bulk_dft[df_bulk_dft["atoms"].notnull()]
df_struct = df_struct[df_struct["atoms"].notnull()]

In [5]:
df_struct.columns

Index(['id_old', 'atoms', 'stoich', 'path', 'source', 'static_id'], dtype='object')

# Processing pre-opt Fingerprints

In [6]:
# FP_struct = FingerPrint(**{
#     "feature_methods": ["voronoi"],
#     "input_data": df_struct,
# #     "input_data": df_combined,
#     "input_index": ["atoms"]})

# FP_struct.generate_fingerprints()
# df_features_pre_opt = FP_struct.fingerprints

# # #############################################################################
# with open(df_features_pre_opt_path, "wb") as fle:
#     pickle.dump(df_features_pre_opt, fle)

In [7]:
# FP_struct = FingerPrint(**{
#     "feature_methods": ["voronoi"],
#     "input_data": df_struct_kirsten,
# #     "input_data": df_combined,
#     "input_index": ["atoms"]})

# FP_struct.generate_fingerprints()
# df_features_pre_opt = FP_struct.fingerprints

# # #############################################################################
# with open(df_features_pre_opt_kirsten_path, "wb") as fle:
#     pickle.dump(df_features_pre_opt, fle)

In [8]:
with open(df_features_pre_opt_path, "rb") as fle:
    df_features_pre_opt = pickle.load(fle)

with open(df_features_pre_opt_kirsten_path, "rb") as fle:
    df_features_pre_opt_kirsten = pickle.load(fle)

In [9]:
with open(df_features_post_opt_path, "rb") as fle:
    df_features_post_opt = pickle.load(fle)

In [10]:
df_features_post_opt["data", "INDEX_OLD"] = df_features_post_opt.index
df_features_post_opt["data", "INDEX_NEW"] = df_features_post_opt["data", "INDEX_OLD"] + "_" + df_features_post_opt["data"]["source"]

df_features_post_opt = df_features_post_opt.set_index(df_features_post_opt["data", "INDEX_NEW"])
df_features_post_opt = df_features_post_opt.drop(labels=[["data", "INDEX_NEW"]], axis=1)

print("df_features_post_opt.shape:", df_features_post_opt.shape)

df_features_post_opt.shape: (836, 273)


In [11]:
df_bulk_dft["INDEX_NEW"] = df_bulk_dft.index + "_" + df_bulk_dft["source"]
df_bulk_dft["INDEX_OLD"] = df_bulk_dft.index
df_bulk_dft = df_bulk_dft.set_index("INDEX_NEW")

In [12]:
ids_to_process = [i for i in df_bulk_dft.index if i not in df_features_post_opt.index]
df_bulk_dft_not_processed = df_bulk_dft.loc[ids_to_process]

In [13]:
print("df_bulk_dft_not_processed.shape:", df_bulk_dft_not_processed.shape)
# df_bulk_dft_not_processed.head()

df_bulk_dft_not_processed.shape: (3, 12)


In [14]:
# df_bulk_dft[df_bulk_dft["stoich"] == "AB2"].sort_values("energy_pa")

In [15]:
FP_struct = FingerPrint(**{
    "feature_methods": ["voronoi"],
    "input_data": df_bulk_dft_not_processed,
    "input_index": ["atoms"]})

FP_struct.generate_fingerprints()
df_features_post_opt_new = FP_struct.fingerprints

# Add the 'source' column to features dataframe since there are duplicate ids
# due to the fact that Chris and I ran the same structures
df_features_post_opt_new["data", "source"] = df_bulk_dft_not_processed["source"]
df_features_post_opt_new["data", "INDEX_OLD"] = df_bulk_dft_not_processed["INDEX_OLD"]

Generate Voronoi fingerprint of 3 structures


In [16]:
df_features_post_opt_comb = pd.concat([
    df_features_post_opt,
    df_features_post_opt_new])

nan_mask = df_features_post_opt_comb["voronoi"].isnull().any(axis="columns")

df_features_post_opt_comb_cpy = copy.deepcopy(df_features_post_opt_comb)

df_features_post_opt_comb = df_features_post_opt_comb.loc[~nan_mask]
df_nan_in_voro = df_features_post_opt_comb_cpy.loc[nan_mask]

print(
    "Does `df_features_post_opt` have any NaN values in it: ",
    "\n -->",
    df_features_post_opt_comb["voronoi"].isnull().any(axis="columns").any())

Does `df_features_post_opt` have any NaN values in it:  
 --> False


In [17]:
old_indices = df_features_post_opt_comb["data", "INDEX_OLD"]

df_features_post_opt_comb = df_features_post_opt_comb.set_index(old_indices)
# df_features_post_opt_comb

In [18]:
index_renamed = df_features_post_opt_comb.index.rename("id_unique")
df_features_post_opt_comb = df_features_post_opt_comb.set_index(index_renamed)
df_features_post_opt_comb = df_features_post_opt_comb.drop(("data", "INDEX_OLD"), axis=1)

index_renamed = df_bulk_dft.index.rename("id_unique")
df_bulk_dft = df_bulk_dft.set_index(index_renamed)
df_bulk_dft = df_bulk_dft.drop(("INDEX_OLD"), axis=1)

In [19]:
#############################################################################
with open(df_features_post_opt_path, "wb") as fle:
    pickle.dump(df_features_post_opt_comb, fle)

In [20]:
ab2_ids = df_ids[df_ids["stoich"] == "AB2"]["unique_ids"]

tmp = df_features_pre_opt.loc[ab2_ids].describe().loc["std"].tolist()
print("df_features_pre_opt.shape:", df_features_pre_opt.shape)

len([i for i in tmp if i < 0.00000000001])

print(df_features_pre_opt.loc[:, df_features_pre_opt.loc[ab2_ids].describe().loc["std"] < 0.00001].shape)
tmpa = df_features_pre_opt.loc[:, df_features_pre_opt.loc[ab2_ids].describe().loc["std"] < 0.00001]

tmpa = [i[1] for i in tmpa.columns]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


df_features_pre_opt.shape: (967, 271)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


(967, 170)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':


In [21]:
ab2_ids = df_ids[df_ids["stoich"] == "AB3"]["unique_ids"]

tmp = df_features_pre_opt.loc[ab2_ids].describe().loc["std"].tolist()
print("df_features_pre_opt.shape:", df_features_pre_opt.shape)

len([i for i in tmp if i < 0.00000000001])

print(df_features_pre_opt.loc[:, df_features_pre_opt.loc[ab2_ids].describe().loc["std"] < 0.00001].shape)
tmpb = df_features_pre_opt.loc[:, df_features_pre_opt.loc[ab2_ids].describe().loc["std"] < 0.00001]

tmpb = [i[1] for i in tmpb.columns]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


df_features_pre_opt.shape: (967, 271)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


(967, 170)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':


In [22]:

tmpa

[i for i in tmpa if i not in tmpb]

len(set(tmpa) & set(tmpb))

len(tmpa)

170

In [23]:
271 - 170

# 125 - 102

101