In [1]:
import json
from pymatgen.core.structure import Structure
from pymatgen.core.periodic_table import Element
import os

# crystal_funcs lives in ../../data_scripts and is imported by temporarily
# switching the working directory there. The original directory is captured
# with os.getcwd() and restored in a `finally` clause, so a failed import
# cannot leave the kernel stranded in data_scripts (which would break every
# relative path used by the later cells).
current_dir = os.getcwd()
os.chdir('../../data_scripts')
try:
    from crystal_funcs import clean_oxide
finally:
    os.chdir(current_dir)
import numpy as np
import pandas as pd

In [2]:
# JSON file to load; the path is relative to the current working directory.
file_path = "2022-10-19-wbm-computed-structure-entries+init-structs.json"

# Read the file in one go and parse it; `data` is inspected by the next cells.
with open(file_path) as json_file:
    data = json.loads(json_file.read())


In [3]:
# Check which container type json.load produced before indexing into it.
type(data)

dict

In [4]:
# List the top-level keys of the loaded JSON.
data.keys()

dict_keys(['material_id', 'formula_from_cse', 'initial_structure', 'computed_structure_entry'])

`initial_structure` holds the unrelaxed structures; `computed_structure_entry['structure']` holds the relaxed structures.

In [5]:
# Collect the computed-structure entries into a plain list of entry dicts.
computed_entries = data['computed_structure_entry']
wbm_data = list(computed_entries.values())

In [6]:
# Normalise every entry in place:
#   * rename the 'entry_id' key to 'material_id'
#   * turn the serialized structure dict into a pymatgen Structure
# Both steps are guarded so the cell can be re-run on the same kernel: without
# the guards a second run raises KeyError on the already-removed 'entry_id'
# and hands an existing Structure object to Structure.from_dict.
for d in wbm_data:
    if 'entry_id' in d:
        d['material_id'] = d.pop('entry_id')
    if not isinstance(d['structure'], Structure):
        d['structure'] = Structure.from_dict(d['structure'])

In [7]:
# Keep only the entries whose structure contains oxygen.
oxygen = Element('O')
oxs = [entry for entry in wbm_data if oxygen in entry['structure'].elements]
print(len(oxs))

27946


In [8]:
# Run clean_oxide over all oxygen-containing entries.
# For a quick trial run, pass a slice such as oxs[:100] as pymatgenArray instead.
good_data = clean_oxide(
    experimental=False,
    pymatgenArray=oxs,
    reportBadData=False,
    read_oxide_type=False,
)


The initial data length is 27946
The number of entries with anions other than Oxygen were 4232
The number of entries with different oxidation types were 1633
The number of entries where valence/oxidation could not be analyzed were 7987
The number of entries where the primitive structure could not be calculated were 0
The length of data after removing undesired entries is  14094


In [9]:
# good_data[0]

In [10]:
# Build the table from the cleaned records, then drop the '@module', '@class'
# and 'data' columns in the same expression.
dropped_columns = ['@module', '@class', 'data']
df = pd.DataFrame.from_records(good_data).drop(columns=dropped_columns)


In [11]:
# Show which columns remain in df.
df.columns

Index(['energy', 'composition', 'correction', 'parameters', 'structure',
       'material_id'],
      dtype='object')

In [12]:
# Save the DataFrame as a pickle file in the current working directory.
df.to_pickle('wbm_oxides.pkl')

Switching to an array seems difficult and unnecessary. However, it seems we need to add a material_id field to each data point.
That could be obtained from data['material_id'] at the same index. Don't worry too much about sampling the correct index at first; we will use the entire dataset once the pipeline is written anyway.

In [13]:
# Preview the first rows of df.
df.head()

Unnamed: 0,energy,composition,correction,parameters,structure,material_id
0,-61.336153,"{'O': 5.0, 'Na': 1.0, 'Bi': 1.0, 'B': 2.0}",0.0,"{'potcar_symbols': ['PAW_PBE Na_pv 05Jan2001',...","[[1.71238096 1.02567987 3.29744478] Na, [1.270...",wbm-1-4374
1,-87.844284,"{'Ba': 2.0, 'O': 6.0, 'B': 2.0, 'Eu': 1.0}",0.0,"{'potcar_symbols': ['PAW_PBE Eu 08Apr2002', 'P...",[[-8.91300683e-05 3.43426255e+00 6.10967762e...,wbm-1-4375
2,-80.455899,"{'Ba': 2.0, 'O': 6.0, 'B': 2.0, 'Yb': 1.0}",0.0,"{'potcar_symbols': ['PAW_PBE B 06Sep2000', 'PA...","[[-2.74103904 1.88398374 4.42993739] B, [2.7...",wbm-1-4376
3,-79.000502,"{'Ba': 3.0, 'O': 6.0, 'B': 2.0}",0.0,"{'potcar_symbols': ['PAW_PBE B 06Sep2000', 'PA...","[[-2.74266703 2.05258232 4.87025798] B, [2.7...",wbm-1-4377
4,-84.876555,"{'Hf': 1.0, 'Ba': 1.0, 'B': 2.0, 'O': 6.0}",0.0,"{'potcar_symbols': ['PAW_PBE B 06Sep2000', 'PA...","[[2.59805358 1.49998554 1.65859481] B, [7.7941...",wbm-1-4378


In [20]:
# Output folder for the POSCAR files; the path is relative to the current working directory.
save_dir = "../../predict_target/label_alignn_format/poscars_for_synth_prediction/wbm_oxides"
# exist_ok=True lets this cell be re-run when the folder already exists.
os.makedirs(save_dir, exist_ok=True)

In [29]:
# Write one POSCAR file per row of df into save_dir for prediction.
# Each file is named after the row's material_id.
for crystal, material_id in zip(df['structure'], df['material_id']):
    filepath = os.path.join(save_dir, f"POSCAR-{material_id}.vasp")
    crystal.to(filename=filepath, fmt='poscar')

In [32]:
# Count every directory entry in save_dir (not only the POSCAR files) and report it.
with os.scandir(save_dir) as entries:
    file_count = sum(1 for _ in entries)
print(f"Number of files in {save_dir}: {file_count}")

Number of files in ../../predict_target/label_alignn_format/poscars_for_synth_prediction/wbm_oxides: 14094
