## Extracting and Storing States From Monte Carlo Runs
In this notebook, we'll extract the states (ASE supercells) stored during our Monte Carlo run (see "job_script.sb" for the example slurm job).

We'll save our states into numpy arrays, which can then be used for initial states for Kinetic Monte Carlo calculations.

Typically, as shown in our example job script, we do multiple Monte Carlo runs (also called Monte Carlo trajectories), each starting from an independent random state. Since the ultimate aim is to train machine learning models, we divide states from these trajectories equally into training and validation states.

In [1]:
import numpy as np
from ase.spacegroup import crystal as crystal_ASE
from ase.build import make_supercell
import pickle
from onsager import crystal, supercell
import h5py
from tqdm import tqdm

In [2]:
# Species in the alloy, in the fixed order used throughout this notebook.
elems = ["Co", "Ni", "Cr", "Fe", "Mn"]

# Species symbol -> 0-based position in `elems` (used to tally compositions).
elemsToIndices = {symbol: position for position, symbol in enumerate(elems)}

# Species symbol -> 1-based species number stored in the occupancy arrays.
elemsToNum = {symbol: position + 1 for symbol, position in elemsToIndices.items()}

In [3]:
# Read the stored crystal data for the orthogonal FCC supercell and rebuild
# the matching onsager crystal / supercell objects.
N_units = 5
crys_data_file = "../../CrysDat_FCC/CrystData_ortho_{}_cube.h5".format(N_units)
with h5py.File(crys_data_file, "r") as fl:
    lattice, superlatt, basis_cubic = (
        np.array(fl[key]) for key in ("Lattice_basis_vectors", "SuperLatt", "basis_sites")
    )

# Lattice parameter used to scale the stored lattice vectors.
a0 = 3.595
crys = crystal.Crystal(
    lattice=a0 * lattice,
    basis=[list(basis_cubic)],  # a single chemistry ("A") holding all basis sites
    chemistry=["A"],
    noreduce=True,
)
print(crys)

# Supercell built from the crystal and the stored superlattice; report its site count.
superFCC_onsg = supercell.ClusterSupercell(crys, superlatt)
print(len(superFCC_onsg.mobilepos))

#Lattice:
  a1 = [3.595 0.    0.   ]
  a2 = [0.    3.595 0.   ]
  a3 = [0.    0.    3.595]
#Basis:
  (A) 0.0 = [0. 0. 0.]
  (A) 0.1 = [0.  0.5 0.5]
  (A) 0.2 = [0.5 0.  0.5]
  (A) 0.3 = [0.5 0.5 0. ]
500


## Now we go ahead and extract the states

In [4]:
def get_states(Temp, lowerTrajId, upperTrajId, startSamp, EndSamp, MC_interval, Nsites, a0=3.595,
               subJobs=(1,)):
    """
    Function to extract ASE supercells, check them and store into numpy arrays.

    Every pickled supercell is checked for (a) the expected composition of its
    subjob and (b) site-by-site consistency with the module-level onsager
    supercell ``superFCC_onsg`` before its occupancies are recorded.

    :param: Temp - the temperature.

    For the following inputs, please also refer to example job script "job_script.sb" as well as the next
    cell in this notebook.

    :param: lowerTrajId - the index of the first trajectory to gather states from.
    :param: upperTrajId - the index of the last trajectory to gather states from (inclusive).

    :param: startSamp - the MC step starting from which to gather states

    :param: EndSamp - the last MC step (inclusive).

    :param: MC_interval - the intervals at which to gather states.

    :param: Nsites - the number of sites in the supercell, including the vacancy site
                     (500 for the supercell built in this notebook).

    :param: a0: the lattice parameter (3.595 Angstroms). Not used inside this function (the
                lattice checks go through ``superFCC_onsg``); kept so existing calls still work.

    :param: subJobs - iterable of "jobID" values (1 to 5) to gather states from. Defaults to
                      (1,), the jobID of the example script; pass range(1, 6) to gather
                      from all five jobs.

    :return: (states, state_Energies) - ``states`` is a list of int8 arrays of length Nsites,
             where entry 0 is left at 0 for the vacancy and entry ``i + 1`` holds the species
             number (see ``elemsToNum``) of atom ``i``; ``state_Energies`` is the list of
             corresponding entries of "Eng_all_steps.npy", indexed by MC step.

    :raises ValueError: if a requested subjob is unknown, or an atom position cannot be mapped
                        to a site of ``superFCC_onsg``.
    :raises AssertionError: if a supercell fails a composition or consistency check.
    """
    # The "jobID" variable in the example script was set to 1.
    # In our actual runs, 4 more jobs with jobID = 2, 3, 4 and 5 were also run,
    # each with atom counts of Co, Ni, Cr, Fe and Mn as specified below.
    expected_counts = {
        1: np.array([99, 100, 100, 100, 100]),
        2: np.array([100, 99, 100, 100, 100]),
        3: np.array([100, 100, 99, 100, 100]),
        4: np.array([100, 100, 100, 99, 100]),
        5: np.array([100, 100, 100, 100, 99]),
    }

    subJobs = tuple(subJobs)  # iterated more than once below
    unknown = [sj for sj in subJobs if sj not in expected_counts]
    if unknown:
        raise ValueError("Unknown subjob id(s): {}".format(unknown))

    # Remember the supercells have had the (0., 0., 0.) site deleted for the vacancy
    states = []
    state_Energies = []

    for subJob in subJobs:
        print("Subjob: {}".format(subJob), flush=True)
        for traj in tqdm(range(lowerTrajId, upperTrajId + 1), ncols=65, position=0, leave=True):

            dr = "{0}_{1}/{0}_{1}_{2}/".format(Temp, subJob, traj)
            Run_Energies = np.load(dr + "Eng_all_steps.npy")

            for samp in range(startSamp, EndSamp + 1, MC_interval):

                # get the saved supercell
                # NOTE: pickle.load can execute arbitrary code - only load checkpoints
                # produced by your own Monte Carlo runs.
                fileName = dr + "chkpt/supercell_{}.pkl".format(samp)
                with open(fileName, "rb") as fl:
                    superFCC = pickle.load(fl)

                # check the supercell composition
                elemCounts = np.zeros(len(elems), dtype=int)
                for atom in superFCC:
                    elemCounts[elemsToIndices[atom.symbol]] += 1

                assert np.array_equal(elemCounts, expected_counts[subJob]), \
                    "Unexpected composition {} in {}".format(elemCounts, fileName)

                # Check that the supercells are always consistent with onsager and store occupancies
                assert np.allclose(superFCC.cell[:], superFCC_onsg.lattice)
                assert np.allclose(superFCC.cell[:] / N_units, superFCC_onsg.crys.lattice)
                assert len(superFCC) == Nsites - 1

                # Entry 0 is never written and stays 0 (the vacancy site).
                occs = np.zeros(len(superFCC) + 1, dtype=np.int8)
                for site, atom in enumerate(superFCC):
                    xs = atom.position
                    assert not np.allclose(xs, 0.)
                    Rsite, ciSite = superFCC_onsg.crys.cart2pos(xs)
                    try:
                        siteInd, _ = superFCC_onsg.index(Rsite, ciSite)
                    except Exception as err:
                        raise ValueError(
                            "Site not found? Cartesian position: {}".format(xs)) from err
                    assert siteInd > 0
                    # Atom i of the ASE supercell must sit on onsager site i + 1.
                    assert siteInd == atom.index + 1 == site + 1, \
                        "{} {} {} {}".format(xs, Rsite, siteInd, atom.index)
                    occs[site + 1] = elemsToNum[atom.symbol]

                states.append(occs)
                state_Energies.append(Run_Energies[samp])

    print("Gathered total {} states from trajectories {} to {} "
          "in subjob(s) {}.".format(len(states), lowerTrajId, upperTrajId,
                                    ", ".join(str(sj) for sj in subJobs)))

    return states, state_Energies

In [5]:
# Total number of sites in the supercell (the vacancy site included).
Nsites = len(superFCC_onsg.mobilepos)

T = 1073
print("T : {}".format(T), flush=True)

# Sampling window shared by both sets: (first MC step, last MC step, interval).
sampling_window = (10000, 10000 + 99 * 1000, 1000)

# Training set: states from trajectories 1 to 20.
states_1_to_20, Energies_1_to_20 = get_states(T, 1, 20, *sampling_window, Nsites)

# Validation set: states from trajectories 21 to 40.
states_21_to_40, Energies_21_to_40 = get_states(T, 21, 40, *sampling_window, Nsites)
print("\n\n", flush=True)

T : 1073
Subjob: 1


100%|████████████████████████████| 20/20 [06:25<00:00, 19.29s/it]

Gathered total 2000 states from trajectories 1 to 20 in each subjob 1 to 5.
Subjob: 1



100%|████████████████████████████| 20/20 [06:31<00:00, 19.56s/it]

Gathered total 2000 states from trajectories 21 to 40 in each subjob 1 to 5.








In [7]:
# Stack the training and validation sets (training first) and write them to
# numpy files, to be used as initial states for KMC simulations.

states_1_to_20 = np.array(states_1_to_20, dtype=np.int8)
states_21_to_40 = np.array(states_21_to_40, dtype=np.int8)

Energies_1_to_20 = np.array(Energies_1_to_20)
Energies_21_to_40 = np.array(Energies_21_to_40)

n_sites = len(superFCC_onsg.mobilepos)
n_train = states_1_to_20.shape[0]
n_total = n_train + states_21_to_40.shape[0]

# Every state must cover all sites of the supercell.
assert states_1_to_20.shape[1] == states_21_to_40.shape[1] == n_sites

statesAll = np.zeros((n_total, n_sites), dtype=np.int8)
statesAll[:n_train] = states_1_to_20
statesAll[n_train:] = states_21_to_40

EnergiesAll = np.zeros(n_total)
EnergiesAll[:n_train] = Energies_1_to_20
EnergiesAll[n_train:] = Energies_21_to_40

# Site 0 must be unoccupied (0) in every state.
assert np.all(statesAll[:, 0] == 0)

np.save("statesAll_{}.npy".format(T), statesAll)
np.save("statesEnergies_CG_{}.npy".format(T), EnergiesAll)