# Intro
This workflow was designed to allow a potential to be trained to predict relaxed total energies from unrelaxed structures (named 'POSCAR_orig' here). It is shared in case it can be adapted for other uses where the cfg files for MTP need to be customised.

## Requirements:
- python3
- ase

## User inputs:
- Define location of directories with structure data
- Set a size for the training and test sets (note these may need to be edited after checking for convergence, etc. later in the workflow)
- Choose names for the cfg files containing the train and test set structures

In [None]:
import os
import glob
import ase.io as io
import ase as ase
import random

# Number of structures to place in the training and test sets.
trainSize, testSize = 330, 140

# Names of the cfg files that will hold the train and test set structures.
trainFile = "oneElongOnly_train330.cfg"
testFile = "oneElongOnly_test140.cfg"

# Root directory of the structure data.
data_dir = '/home/suzannekwallace/Projects/Co_xMn_{3-x}O_4/comparisonStudy_NNandMTP/cleanedSetsABC/labelled/javi_oneElongOnly_orig'

# Candidate calculation directories: <data_dir>/<anything>/con<anything>/
list_to_read = glob.glob(
    os.path.join(data_dir, '*/con*/'),
    recursive=True,
)

In [2]:
def writeCFG(fileList: list, outFile: str):
    """Write one cfg entry per calculation directory to ``outFile``.

    For every directory in ``fileList`` that contains both a 'POSCAR_orig'
    and an 'OUTCAR' file, the cell, atomic positions and chemical species are
    taken from 'POSCAR_orig' and the energy from the last configuration read
    from 'OUTCAR' (``index='-1'``).

    A directory is skipped, and nothing is written for it, when:
      * either file is missing (skipped silently),
      * reading or extracting the data fails (a message is printed),
      * the structure contains a species other than Co, Mn or O
        (a message is printed).

    Args:
        fileList: Paths of the calculation directories to process.
        outFile: Path of the cfg file to create; an existing file is
            overwritten.
    """
    # Integer type written to the cfg file for each supported species.
    speciesIndex = {'Co': 0, 'Mn': 1, 'O': 2}

    # The context manager closes the file even if writing an entry fails.
    with open(outFile, "w") as cfgFile:
        for datum in fileList:
            pos = os.path.join(datum, 'POSCAR_orig')
            out = os.path.join(datum, 'OUTCAR')
            if not (os.path.isfile(pos) and os.path.isfile(out)):
                continue

            try:
                outStruc = io.read(out, index='-1', format='vasp-out')
                posStruc = io.read(pos, format='vasp')
                energy = outStruc.get_potential_energy()
                cell = posStruc.get_cell()
                positions = posStruc.get_positions()
                symbols = posStruc.get_chemical_symbols()
            except Exception:
                print('Failed to extract info or missing POSCAR_orig or OUTCAR in '+str(datum))
                # Skip this directory: writing now would either crash or
                # reuse the data left over from the previous directory.
                continue

            # An unmapped species would leave fewer types than atoms and
            # shift every following type onto the wrong atom, so the whole
            # structure is rejected instead.
            if any(symbol not in speciesIndex for symbol in symbols):
                print('Not Co, Mn, or O...?')
                continue
            types = [speciesIndex[symbol] for symbol in symbols]
            ids = range(1, len(symbols)+1)

            print('',file=cfgFile)
            print('BEGIN_CFG',file=cfgFile)
            print(' Size',file=cfgFile)
            print('    ',len(symbols),file=cfgFile)
            print(' Supercell',file=cfgFile)
            for v1, v2, v3 in cell:
                print('        ',v1,'          ',v2,'          ',v3,file=cfgFile)
            print(' AtomData:  id type       cartes_x      cartes_y      cartes_z',file=cfgFile)
            for atomId, atomType, (p1, p2, p3) in zip(ids, types, positions):
                print('            ',atomId,'    ',atomType,'    ',p1,'    ',p2,'    ',p3,file=cfgFile)
            print(' Energy',file=cfgFile)
            print('        ',energy,file=cfgFile)
            print('Feature comment   ',datum,file=cfgFile) # Add comment for original data location
            print('END_CFG',file=cfgFile)

## Check that files can be read successfully by ase:

In [3]:
# Collect the directories whose 'POSCAR_orig' and 'OUTCAR' can both be read by
# ase. Directories missing either file are left out without a message.
verifiedList = []
for entry in list_to_read:
    pos = os.path.join(entry, 'POSCAR_orig')
    out = os.path.join(entry, 'OUTCAR')
    if not (os.path.isfile(pos) and os.path.isfile(out)):
        continue
    try:
        # The parsed structures are discarded; only readability is checked.
        io.read(out, index='-1', format='vasp-out')
        io.read(pos, format='vasp')
    except Exception:
        # `Exception` rather than a bare `except`, so that Ctrl-C
        # (KeyboardInterrupt) still interrupts a long scan.
        print("Error reading files with ase from "+str(entry))
    else:  # Only append to the list if reading the files with ase succeeded
        verifiedList.append(entry)

Error reading files with ase from /home/suzannekwallace/Projects/Co_xMn_{3-x}O_4/comparisonStudy_NNandMTP/cleanedSetsABC/labelled/javi_oneElongOnly_orig/block_B.2/con0.54/
479
472
330
140


## Check that calculations had converged:

In [None]:
import re

# Text searched for in each OUTCAR to decide whether the structural
# relaxation reached convergence.
convergedMarker = 'reached required accuracy - stopping structural energy minimisation'

converged = []
notConverged = []
for entry in verifiedList:
    # `with` closes the file even if reading fails part-way through.
    with open(os.path.join(entry, 'OUTCAR'), 'r') as outcar:
        # Scan line by line instead of loading the whole file. A single
        # occurrence is enough, so an OUTCAR that contains the marker more
        # than once is still counted as converged.
        hasConverged = any(convergedMarker in line for line in outcar)
    if hasConverged:
        converged.append(entry)
    else:
        notConverged.append(entry)

print("The number that reached geometrical convergence is: "+str(len(converged)))
print('Only these structures will be used to generate train and test sets in the following cells.')

In [None]:
# Warn (without aborting) when more structures are requested than are available;
# the slices below then simply come out shorter than requested.
if len(converged) < trainSize + testSize:
    print(f"Train set + test set > total data set size of {len(converged)}")

# Shuffle in place, then take two consecutive, non-overlapping slices.
random.shuffle(converged)
trainSet = converged[:trainSize]
testSet = converged[trainSize:trainSize + testSize]

#print(len(list_to_read))
#print(len(verifiedList))
#print(len(converged))
#print(len(trainSet))
#print((len(testSet)))

In [4]:
# Write the training set and the test set to their respective cfg files.
writeCFG(trainSet, trainFile)
writeCFG(testSet, testFile)