## Extracting KMC Data for Making Single-step Dataset
In this notebook, we extract the data from our 2-step KMC runs (as shown in the sample Slurm script "job_script_KMC.sb"). We then save the data in an HDF5 file, in a format that can be used by our machine learning and cluster expansion codes.

This notebook also gives a more hands-on look at the outputs of the KMC code.

In [1]:
import numpy as np
import h5py
from tqdm import tqdm
from scipy.constants import physical_constants
# Boltzmann constant in eV/K; scipy stores each constant as a (value, unit, uncertainty)
# tuple, so index 0 picks out the numerical value.
kB = physical_constants["Boltzmann constant in eV/K"][0]

In [2]:
from onsager import crystal, supercell

In [3]:
# Read the pre-computed crystal/supercell arrays from the shared FCC data file.
# The dataset names are listed in the same order as the variables they are unpacked into.
crysDatKeys = ("Lattice_basis_vectors", "SuperLatt", "dxList_1nn",
               "JumpSiteIndexPermutation", "NNsiteList_sitewise")
with h5py.File("../../CrysDat_FCC/CrystData.h5", "r") as fl:
    lattice, superlatt, dxList, JumpNewSites, NNsites = [np.array(fl[key]) for key in crysDatKeys]

# Rebuild the FCC crystal with onsager and make sure it matches the stored lattice vectors
crys = crystal.Crystal.FCC(a0 = 1.0, chemistry=["A"])
assert np.allclose(lattice, crys.lattice)

# Supercell object, used later to convert positions into site indices
superFCC = supercell.ClusterSupercell(crys, superlatt)

# Column 0 of the neighbor table with its first row dropped: indexed later by the selected
# jump to get the site the vacancy moves to (presumably site 0 is the vacancy site and
# row 0 is the site itself - confirm against the file that generates CrystData.h5)
NNsites_vac = NNsites[1:, 0]

In [4]:
# Temperature in kelvin: it is multiplied with kB (eV/K) in the Boltzmann factors, and it
# also appears in the names of the input/output files and directories used below.
T = 1073

In [5]:
# Make the intervals and subintervals
intervalSize = 2000     # number of samples covered by one interval directory
subIntervalSize = 400   # number of samples covered by one "states_<start>_<end>" sub-directory

# Names of the interval directories (kept literal because they must match the names on disk)
intervals = ["0_2e3", "2e3_4e3", "4e3_6e3", "6e3_8e3", "8e3_10e3",
             "10e3_12e3", "12e3_14e3", "14e3_16e3", "16e3_18e3", "18e3_20e3"]

# Index of the first sample of every interval, derived from the interval size so that
# it can never get out of step with the list of interval names
startSamps = [intervalSize * intervalIdx for intervalIdx in range(len(intervals))]
assert len(startSamps) == len(intervals)

# For every interval, the names of the sub-directories that split it into equal chunks
SubIntervals = [["states_{}_{}".format(x, x + subIntervalSize)
                 for x in range(st, st + intervalSize, subIntervalSize)]
                for st in startSamps]

for st, subInts in zip(startSamps, SubIntervals):
    print(st)
    print(subInts)

0
['states_0_400', 'states_400_800', 'states_800_1200', 'states_1200_1600', 'states_1600_2000']
2000
['states_2000_2400', 'states_2400_2800', 'states_2800_3200', 'states_3200_3600', 'states_3600_4000']
4000
['states_4000_4400', 'states_4400_4800', 'states_4800_5200', 'states_5200_5600', 'states_5600_6000']
6000
['states_6000_6400', 'states_6400_6800', 'states_6800_7200', 'states_7200_7600', 'states_7600_8000']
8000
['states_8000_8400', 'states_8400_8800', 'states_8800_9200', 'states_9200_9600', 'states_9600_10000']
10000
['states_10000_10400', 'states_10400_10800', 'states_10800_11200', 'states_11200_11600', 'states_11600_12000']
12000
['states_12000_12400', 'states_12400_12800', 'states_12800_13200', 'states_13200_13600', 'states_13600_14000']
14000
['states_14000_14400', 'states_14400_14800', 'states_14800_15200', 'states_15200_15600', 'states_15600_16000']
16000
['states_16000_16400', 'states_16400_16800', 'states_16800_17200', 'states_17200_17600', 'states_17600_18000']
18000
['sta

In [6]:
# Load the initial states

initStates = np.load("InitStates/statesAll_{0}.npy".format(T))
Nsamples = initStates.shape[0]
print("Nsamples: {}".format(Nsamples))

NSpec = 6 # including the vacancy
N_units = 8
subIntervalSize = 400 # number of samples stored in each "states_<start>_<end>" sub-directory

finalStates_UT = np.zeros_like(initStates) # these will not have vacancy at (0, 0, 0)
finalStates = np.zeros_like(initStates) # these WILL have vacancy at (0, 0, 0)

SpecDisps = np.zeros((Nsamples, NSpec, 3)) # displacement of the species in step 1.
SpecDisps_step2 = np.zeros((Nsamples, NSpec, 3)) # displacement of the species in step 2.
t_arr = np.zeros(Nsamples) # the time for step 1
t_arr_step2 = np.zeros(Nsamples) # the time for step 2

# Next, rates and barriers for the 12 jumps out of the initial and final states of step 1.
AllJumpRates_Init = np.zeros((Nsamples, 12))
AllJumpRates_Fin = np.zeros((Nsamples, 12))

AllJumpBarriers_Init = np.zeros((Nsamples, 12))
AllJumpBarriers_Fin = np.zeros((Nsamples, 12))

# random numbers to check for correctness
randomNumbers = np.zeros(Nsamples)
JumpSelects = np.zeros(Nsamples, dtype=np.int8)

randomNumbers_step2 = np.zeros(Nsamples)
JumpSelects_step2 = np.zeros(Nsamples, dtype=np.int8)

# Keeps track of which samples have been filled in, so that a gap in the interval lists
# cannot silently leave zero-valued rows in the arrays above.
sampleLoaded = np.zeros(Nsamples, dtype=bool)

# Now load and store all the states
for intervalIdx in range(len(intervals)):
    intervalRun = intervals[intervalIdx]

    subInts = SubIntervals[intervalIdx]

    startState = startSamps[intervalIdx]

    for subIntRunIdx in range(len(subInts)):
        subStart = startState + subIntRunIdx * subIntervalSize
        subEnd = subStart + subIntervalSize

        # Load the data
        # the states the KMC run started from - must be the same as our slice of initial states
        states_stored_step0 = np.load("{0}/{1}/states_{2}_{3}/states_step0_{0}.npy".format(T, intervalRun,
                                                                                           subStart, subEnd))

        print(subStart, subEnd)
        assert np.array_equal(initStates[subStart : subEnd], states_stored_step0)

        # step 1
        with h5py.File("{0}/{1}/states_{2}_{3}/data_{0}_1_{2}.h5".format(T, intervalRun, subStart, subEnd),
                       "r") as fl:
            FinStates_step1 = np.array(fl["FinalStates"])
            Disps_step1 = np.array(fl["SpecDisps"])
            times_step1 = np.array(fl["times"])
            allRates_step1 = np.array(fl["AllJumpRates"])
            allbarriers_step1 = np.array(fl["AllJumpBarriers"])
            rands_step1 = np.array(fl["TestRandNums"])
            JSelects_step1 = np.array(fl["JumpSelects"])

            # energies for every jump; only the ISEs are used in the checks below
            ISEs_steps1 = np.array(fl["AllJumpISEnergy"])
            TSEs_steps1 = np.array(fl["AllJumpTSEnergy"])
            FSEs_steps1 = np.array(fl["AllJumpFSEnergy"])

        # step 2
        with h5py.File("{0}/{1}/states_{2}_{3}/data_{0}_2_{2}.h5".format(T, intervalRun, subStart, subEnd),
                       "r") as fl:
            FinStates_step2 = np.array(fl["FinalStates"])
            Disps_step2 = np.array(fl["SpecDisps"])
            times_step2 = np.array(fl["times"])
            allRates_step2 = np.array(fl["AllJumpRates"])
            allbarriers_step2 = np.array(fl["AllJumpBarriers"])
            rands_step2 = np.array(fl["TestRandNums"])
            JSelects_step2 = np.array(fl["JumpSelects"])

            # energies for every jump; only the ISEs are used in the checks below
            ISEs_steps2 = np.array(fl["AllJumpISEnergy"])
            TSEs_steps2 = np.array(fl["AllJumpTSEnergy"])
            FSEs_steps2 = np.array(fl["AllJumpFSEnergy"])

        # store samples in necessary arrays
        finalStates_UT[subStart : subEnd] = FinStates_step1[:]

        SpecDisps[subStart : subEnd, :, :] = Disps_step1[:]
        SpecDisps_step2[subStart : subEnd, :, :] = Disps_step2[:]

        t_arr[subStart : subEnd] = times_step1[:]
        t_arr_step2[subStart : subEnd] = times_step2[:]

        AllJumpRates_Init[subStart : subEnd] = allRates_step1[:]
        AllJumpRates_Fin[subStart : subEnd] = allRates_step2[:]

        AllJumpBarriers_Init[subStart : subEnd] = allbarriers_step1[:]
        AllJumpBarriers_Fin[subStart : subEnd] = allbarriers_step2[:]

        JumpSelects[subStart : subEnd] = JSelects_step1[:]
        randomNumbers[subStart : subEnd] = rands_step1[:]

        JumpSelects_step2[subStart : subEnd] = JSelects_step2[:]
        randomNumbers_step2[subStart : subEnd] = rands_step2[:]

        sampleLoaded[subStart : subEnd] = True

        # Do some checks of detailed balance and site swapping and store the periodically translated states too
        for samp in range(subIntervalSize):
            init_stp1 = initStates[subStart + samp]
            fin_stp1_UT = FinStates_step1[samp] # the untranslated final state
            stp1Select = JSelects_step1[samp] # index of the jump selected in step 1
            fin_stp1 = init_stp1[JumpNewSites[stp1Select]] # the periodically translated final state
            finalStates[subStart + samp] = fin_stp1

            # check detailed balance
            # get the transition state energy from the initial state
            x1 = ISEs_steps1[samp, stp1Select] + allbarriers_step1[samp, stp1Select]

            # get the transition state energy from the final state: the jump back is looked up
            # at the neighboring index (even -> next, odd -> previous), which presumably means the
            # jumps are stored in forward/reverse pairs - confirm against the stored jump list.
            inc = 1 if stp1Select % 2 == 0 else -1
            revSelect = stp1Select + inc
            x2 = ISEs_steps2[samp, revSelect] + allbarriers_step2[samp, revSelect]

            # check that the two transition state energies are same within tolerance
            assert np.allclose(x1, x2)

            # Check that the sites were swapped properly
            stp2VacSite = NNsites_vac[stp1Select]

            dxStep1 = dxList[stp1Select]
            dxStep1R, _ = superFCC.crys.cart2pos(dxStep1)
            assert stp2VacSite == superFCC.index(dxStep1R, (0,0))[0]

            assert fin_stp1_UT[stp2VacSite] == 0
            assert fin_stp1_UT[0] == init_stp1[stp2VacSite]
            # every site other than the two exchanged ones must be untouched by step 1
            unchanged = np.ones(init_stp1.shape[0], dtype=bool)
            unchanged[[0, stp2VacSite]] = False
            assert np.array_equal(fin_stp1_UT[unchanged], init_stp1[unchanged])

            # Now for step 2
            fin_stp2_UT = FinStates_step2[samp] # the untranslated final state

            NNsites_vac_stp2 = NNsites[1:, stp2VacSite]

            stp2Select = JSelects_step2[samp]
            stp3VacSite = NNsites_vac_stp2[stp2Select]

            # the second jump starts from where the first one ended, so the displacements add up
            dxStep2 = dxList[stp2Select]
            dxStep2R = superFCC.crys.cart2pos(dxStep2)[0] + dxStep1R
            assert stp3VacSite == superFCC.index(dxStep2R, (0,0))[0]

            assert fin_stp2_UT[stp3VacSite] == 0
            assert fin_stp2_UT[stp2VacSite] == fin_stp1_UT[stp3VacSite]

            # every site other than the two exchanged ones must be untouched by step 2
            unchanged = np.ones(fin_stp1_UT.shape[0], dtype=bool)
            unchanged[[stp2VacSite, stp3VacSite]] = False
            assert np.array_equal(fin_stp1_UT[unchanged], fin_stp2_UT[unchanged])

# Every initial state must have been matched with KMC output
assert sampleLoaded.all(), \
    "{} samples were not covered by the listed intervals".format(np.count_nonzero(~sampleLoaded))

Nsamples: 20000
0 400
400 800
800 1200
1200 1600
1600 2000
2000 2400
2400 2800
2800 3200
3200 3600
3600 4000
4000 4400
4400 4800
4800 5200
5200 5600
5600 6000
6000 6400
6400 6800
6800 7200
7200 7600
7600 8000
8000 8400
8400 8800
8800 9200
9200 9600
9600 10000
10000 10400
10400 10800
10800 11200
11200 11600
11600 12000
12000 12400
12400 12800
12800 13200
13200 13600
13600 14000
14000 14400
14400 14800
14800 15200
15200 15600
15600 16000
16000 16400
16400 16800
16800 17200
17200 17600
17600 18000
18000 18400
18400 18800
18800 19200
19200 19600
19600 20000


In [7]:
# Consistency check on the extracted data: for the jumps out of both the initial and the
# final states, every stored rate has to equal exp(-barrier / (kB * T)) within the default
# tolerance of np.allclose.
for jumpRates, jumpBarriers in ((AllJumpRates_Init, AllJumpBarriers_Init),
                                (AllJumpRates_Fin, AllJumpBarriers_Fin)):
    assert np.allclose(jumpRates, np.exp(- jumpBarriers / (kB * T)))

In [8]:
# Store a permutation sequence to mix up all the states later if needed and then split into training and testing.
# The generator is seeded so that re-running the notebook writes the same permutation again.
permSeed = 0
perm = np.random.default_rng(permSeed).permutation(initStates.shape[0])

# The times are inverted below to get rates, so a zero (e.g. a sample that was never filled in)
# would end up as an inf in the output file - stop here instead.
assert np.all(t_arr > 0) and np.all(t_arr_step2 > 0), "found non-positive times"

# Save the extracted data in a suitable format for neural network and cluster expansion training
with h5py.File("SingleStep_MEAM_{}_AllRates_2.h5".format(T), "w") as fl:
    fl.create_dataset("Permutation", data=perm)

    # states before and after step 1 (final states with and without the periodic translation)
    fl.create_dataset("InitStates", data=initStates)
    fl.create_dataset("FinStates", data=finalStates)
    fl.create_dataset("FinStates_UT", data=finalStates_UT)

    # species displacements in step 1 and step 2
    fl.create_dataset("SpecDisps", data=SpecDisps)
    fl.create_dataset("SpecDisps_step2", data=SpecDisps_step2)

    # rates and barriers of all the jumps out of the initial and final states of step 1
    fl.create_dataset("AllJumpRates_Init", data=AllJumpRates_Init)
    fl.create_dataset("AllJumpRates_Fin", data=AllJumpRates_Fin)
    fl.create_dataset("AllJumpBarriers_Init", data=AllJumpBarriers_Init)
    fl.create_dataset("AllJumpBarriers_Fin", data=AllJumpBarriers_Fin)

    # times of the two steps and their inverses
    fl.create_dataset("times", data=t_arr)
    fl.create_dataset("rates", data=1./t_arr)
    fl.create_dataset("times_step2", data=t_arr_step2)
    fl.create_dataset("rates_step2", data=1./t_arr_step2)

    # selected jumps and the random numbers stored by the KMC code for checking
    fl.create_dataset("JumpSelects", data=JumpSelects)
    fl.create_dataset("RandNums", data=randomNumbers)
    fl.create_dataset("JumpSelects_step2", data=JumpSelects_step2)
    fl.create_dataset("RandNums_step2", data=randomNumbers_step2)