### Program written by Scott Midgley, 2021
### Scope: Ingest VASP energies from a .csv file and generate the Coulomb matrix eigenspectrum from POSCAR structure files. The output is saved as a .pkl file, ready for machine learning models.

In [1]:
### USER INPUT REQUIRED ###

# Please paste the path to the repository here, and comment/uncomment the
# Windows or Unix variant as needed.
# E.g. repodir = r'C:\Users\<user>\Desktop\repository'
# The notebook later changes into <repodir>/coulomb_matrix/data/rundir, so that
# directory tree must exist below the path given here.

# Windows path (raw string, so the backslashes are not treated as escapes)
#repodir = r'<windows\path\here>'
repodir = r'C:\Users\smidg\Desktop\ml\repository'

# Unix path
#repodir = '<unix/path/here>'

In [2]:
# Import modules.
import os
import time

import numpy as np
import pandas as pd
from matminer.featurizers import structure as sf

# pymatgen removed its root-level re-exports in the 2022.0 release, so
# `from pymatgen import Structure` fails on current versions. Import from the
# defining module first and fall back to the legacy location otherwise.
try:
    from pymatgen.core.structure import Structure
except ImportError:
    from pymatgen import Structure

In [3]:
# Start program timer: record the wall-clock start time so the total run time
# can be reported by the final cell of the notebook.
start_time = time.time()

In [4]:
# Walk down from the repository root into coulomb_matrix/data/rundir, one
# component at a time, then remember the absolute path of the run directory.
for path_component in (repodir, 'coulomb_matrix', 'data', 'rundir'):
    os.chdir(path_component)
rundir = os.getcwd()

In [5]:
# Load the DFT-derived energies into a data frame.
# The .csv lives in the sibling 'datadir' directory and has no header row, so
# the column labels are attached after reading.
os.chdir(os.pardir)
os.chdir('datadir')
energy_column_names = ['SCF Energy', 'BGE']
energies = pd.read_csv("vasp-energies.csv", header=None)
energies.columns = energy_column_names

In [6]:
# Go to structure file directory (a subdirectory of the current data directory)
# and keep its absolute path in `cwd`.
os.chdir('structure_files')
cwd = os.getcwd()

In [7]:
# Iterate over structures in structure directory, generating CME for each configuration.
#
# os.listdir() returns entries in arbitrary, platform-dependent order, but the
# resulting list is later matched to the energies row by row. Sorting the file
# names makes that pairing deterministic and identical on every platform.
# NOTE(review): this presumes the rows of vasp-energies.csv follow the
# lexicographic order of the structure file names - confirm against how the
# .csv was generated (e.g. 'POSCAR_10' sorts before 'POSCAR_2').
cm_list = []
files = sorted(f for f in os.listdir('.') if os.path.isfile(f))
for f in files:
    struct = Structure.from_file(f)
    cm = sf.CoulombMatrix()
    # The featurizer is fitted on this single structure only.
    # NOTE(review): presumably all structures contain the same number of atoms,
    # so every feature vector has the same length - verify.
    fitted_matrix = cm.fit([struct])
    featurized_structure = fitted_matrix.featurize(struct)
    # Store the feature values sorted in descending order.
    cm_list.append(np.sort(featurized_structure)[::-1])
print('Number of matrices read: ', len(cm_list))



Number of matrices read:  10


In [8]:
# Climb two levels up from the structure file directory, then enter 'rundir'
# so that the output file is written there.
for hop in (os.pardir, os.pardir, 'rundir'):
    os.chdir(hop)

In [9]:
# Add CME's to data frame with DFT energies.
# Only the first len(cm_list) energy rows are kept, one per structure read.
if len(cm_list) > len(energies):
    # Fail early with a readable message rather than pandas' generic
    # length-mismatch error raised by the column assignment below.
    raise ValueError(
        'More structures ({}) than energy rows ({}); cannot pair them.'.format(
            len(cm_list), len(energies)))
# Take an explicit copy: assigning a new column onto a slice of `energies`
# triggers pandas' SettingWithCopyWarning, and whether the write lands on a
# view or a copy is ambiguous.
ener = energies.iloc[:len(cm_list)].copy()
ener["Coulomb"] = cm_list

In [10]:
# Shuffle data frame (optional).
# A fixed seed makes the row order - and therefore the saved .pkl file -
# reproducible across Restart & Run All. Set SHUFFLE_SEED to None to get a
# different shuffle on every run.
SHUFFLE_SEED = 42
ener = ener.sample(frac=1, random_state=SHUFFLE_SEED)

In [11]:
# Serialise the data frame to 'input_data.pkl' in the current working
# directory, ready to be loaded by the machine learning models.
pd.to_pickle(ener, 'input_data.pkl')

In [12]:
# Report the total run time, in seconds and in minutes (both rounded to 2 d.p.;
# the minutes figure is derived from the already-rounded seconds).
elapsed = time.time() - start_time
time_s = round(elapsed, 2)
time_m = round(time_s / 60, 2)
for value, unit in ((time_s, 'sec'), (time_m, 'min')):
    print(value, unit)

2.5 sec
0.04 min
