In [25]:
import gzip
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
from pymatgen.core import Structure
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects; the cells below use it
# to show a progress bar while iterating over the structure frames.
tqdm.pandas()

# Root directory of the gzipped structure JSON files and energy CSVs read below.
structure_dir = Path("/projects/rlmolecule/jlaw/inputs/structures")

# Directory that receives the merged pickle written near the end of the notebook.
inputs_dir = Path("/projects/rlmolecule/pstjohn/crystal_inputs/")
# Directory holding the volume-relaxation energy CSV.
volrelax_dir = Path("/projects/rlmolecule/pstjohn/volume_relaxation_outputs/")


def get_structures(filename):
    """Yield one record per structure stored in a gzipped JSON file.

    The file must contain a single JSON object that maps an id to the dict
    form of a pymatgen ``Structure`` (the input of ``Structure.from_dict``).

    Args:
        filename: Path of the gzip-compressed JSON file to read.

    Yields:
        dict: ``{"id": key, "structure": Structure}``, one per entry, in the
        order the keys appear in the file. Iteration is wrapped in ``tqdm``
        so a progress bar is shown.
    """
    # Parse the whole payload before yielding anything so the gzip handle is
    # closed right away; a suspended or abandoned generator then never keeps
    # the file open.
    with gzip.open(filename, "r") as f:
        structure_dicts = json.loads(f.read().decode())

    for key, structure_dict in tqdm(structure_dicts.items()):
        yield {"id": key, "structure": Structure.from_dict(structure_dict)}


# Gzipped JSON structure files (id -> Structure dict), all under `structure_dir`.
battery_relaxed_file = structure_dir / "battery/battery_relaxed_structures.json.gz"
battery_unrelaxed_file = structure_dir / "battery/battery_unrelaxed_structures.json.gz"
# Built from `structure_dir` like its siblings (the resulting path is unchanged)
# so that repointing `structure_dir` moves every input consistently.
battery_volrelaxed_file = structure_dir / "battery/volrelax/battery_vol_rerelax_scf_structures.json.gz"
icsd_strcs_file = structure_dir / "icsd/icsd_structures.json.gz"

# Energy tables that are joined to the structures on `id` further down.
batt_energy_file = structure_dir / "battery/battery_relaxed_energies.csv"
vol_energy_file = volrelax_dir / "20220504_volume_relaxed_energy.csv.gz"
icsd_energy_file = structure_dir / "icsd/icsd_energies.csv"

In [5]:
# Volume-relaxation energies from the main run.
vol_energy = pd.read_csv(vol_energy_file)

# Second batch of volume re-relaxation energies; its columns are renamed so
# they line up with the main table before the two are stacked.
prashun_vol = pd.read_csv('/projects/rlmolecule/pgorai/vol-rerelax-scf/volrerelax_scf_energy.csv')
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with default
# arguments stacks the rows the same way (original index labels are kept).
vol_energy = pd.concat([
    vol_energy,
    prashun_vol.rename(columns={'decoration': 'id', 'numatoms': 'num_sites', 'composition_type': 'comptype'}),
]).drop(columns=['composition', 'dls_volume'])

In [6]:
# Read the relaxed-battery and ICSD energy tables.
batt_energy = pd.read_csv(batt_energy_file)
icsd_energy = pd.read_csv(icsd_energy_file)

# Tag every energy table with the dataset it came from.
icsd_energy["type"] = "icsd"
batt_energy["type"] = "relax"
vol_energy["type"] = "vol"

# Composition is the leading alphanumeric run of the id that is followed by
# an underscore.
vol_energy["composition"] = vol_energy["id"].str.extract("([a-zA-Z0-9]+)_")

In [26]:
# Parse every gzipped structure file into a frame with `id` and `structure`
# columns. The generator expression is consumed in order by the unpacking, so
# the files are read one after another in the sequence listed.
(
    batt_structures,
    unrelaxed_structures,
    vol_structures,
    icsd_structures,
) = (
    pd.DataFrame(get_structures(structure_file))
    for structure_file in (
        battery_relaxed_file,
        battery_unrelaxed_file,
        battery_volrelaxed_file,
        icsd_strcs_file,
    )
)

  0%|          | 0/67840 [00:00<?, ?it/s]

  0%|          | 0/67840 [00:00<?, ?it/s]

  0%|          | 0/7083 [00:00<?, ?it/s]

  0%|          | 0/16445 [00:00<?, ?it/s]

In [36]:
# Stack the unrelaxed structures on top of the re-relaxed ones; when an id is
# present in both, keep='first' retains the unrelaxed entry.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
vol_structures = pd.concat([unrelaxed_structures, vol_structures]).drop_duplicates(subset='id', keep='first')

# Inner join: only ids that also have a row in `vol_energy` survive.
vol_df = vol_structures.merge(vol_energy, on='id')
print(vol_df.iloc[0].structure.volume)
# Rescale every structure to the `volume` value joined in from `vol_energy`.
# `scale_lattice` mutates the Structure in place (the two printed volumes
# differ), so the return value of the apply is discarded.
# NOTE(review): the Structure objects are presumably shared with
# `unrelaxed_structures`, which would then be rescaled too; if `vol_energy`
# holds several rows per id, the last volume wins. Confirm this is intended.
_ = vol_df.progress_apply(lambda x: x.structure.scale_lattice(x.volume), axis=1)
print(vol_df.iloc[0].structure.volume)

266.79110550669


  0%|          | 0/69510 [00:00<?, ?it/s]

484.8785392007747


In [40]:
# Stack the three datasets (ICSD, volume-relaxed, fully relaxed battery), each
# joined to its energies on `id`, then keep only the shared columns.
merged = pd.concat(
    [
        icsd_structures.merge(icsd_energy, on="id"),
        vol_df,
        batt_structures.merge(batt_energy, on="id"),
    ]
)
merged = merged[["id", "type", "composition", "energyperatom", "structure"]]

In [41]:
# Cell volume of every structure, read from the Structure object itself.
merged["volume"] = merged["structure"].progress_apply(lambda structure: structure.volume)

  0%|          | 0/153795 [00:00<?, ?it/s]

In [44]:
# Number of sites in every structure, read from the Structure object itself.
merged["num_sites"] = merged["structure"].progress_apply(lambda structure: structure.num_sites)

  0%|          | 0/153795 [00:00<?, ?it/s]

In [46]:
# Drop rows with any missing value, renumber the index from zero, and pickle
# the result into `inputs_dir`.
(
    merged
    .dropna()
    .reset_index(drop=True)
    .to_pickle(inputs_dir / "20220603_all_structures.p")
)

In [None]:
import sys

In [None]:
# Load IPython's autoreload extension so the `%autoreload` magic is available.
%load_ext autoreload

In [None]:
# Mode 2: reload all modules before each cell runs, so edits to the local
# `preprocess` module are picked up without restarting the kernel.
%autoreload 2
# Make the parent directory importable; this must run before the import below.
sys.path.append('..')
from preprocess import preprocessor, preprocess_s

In [None]:
# Recomputes the `volume` column exactly as the earlier volume cell does;
# only needed if `merged` was rebuilt after that cell ran.
merged["volume"] = merged["structure"].progress_apply(lambda structure: structure.volume)

In [None]:
# Run the preprocessor on the first structure as a quick sanity check.
preprocessor(merged["structure"].iloc[0])