### Program written by Scott Midgley, 2021
Scope: Ingest VASP energies from a .csv file and combine them with cluster correlation functions generated via the Chebyshev method. The output is saved as a .pkl file, ready for use in machine learning models.

In [1]:
### USER INPUT REQUIRED ###

# Set `repodir` below to the root directory of your local copy of the repository,
# commenting/uncommenting the variant that matches your platform.
# E.g. repodir = r'C:\Users\<user>\Desktop\repository'

# Windows path
#repodir = r'<windows\path\here>'
#repodir = r'C:\Users\smidg\Desktop\ml\repository'

# Unix path
repodir = "/home/mts87985/ml-thermo/Machine-Learning-for-Solid-Solutions/"

In [2]:
# Import modules. 
import pandas as pd
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import pickle
import time
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import os

In [3]:
# Read the DFT-derived energies from the headerless .csv file into a data frame.
# The path is built with os.path.join so that it resolves correctly whether or not
# repodir ends with a path separator (plain string concatenation silently produces
# a wrong path when the separator is missing).
energies_path = os.path.join(repodir, "repository-data", "vasp-energies.csv")
energies = pd.read_csv(energies_path, header=None)
# The file is read without a header row, so the two columns are named explicitly.
energies.columns = ['SCF', 'BGE']

In [4]:
# Read in the cluster correlation function data.
# The path is built with os.path.join so that it resolves correctly whether or not
# repodir ends with a path separator.
ccf_path = os.path.join(
    repodir,
    'repository-data',
    'cluster_correlation_functions',
    'correlationmatrix_chebyshev.csv',
)
df = pd.read_csv(ccf_path)

In [5]:
# Collapse the values of each row into a single Python list and store that list
# in a new 'cf_vector' column.
# Any existing 'cf_vector' column is excluded when building the lists, so that
# re-running this cell does not nest the previous vectors inside the new ones.
ccf_columns = [col for col in df.columns if col != 'cf_vector']
df_list = df[ccf_columns].values.tolist()
df['cf_vector'] = df_list

In [6]:
# Join the CCF and energies data frames column-wise, and shuffle (optional).
# pd.concat aligns the frames on their index and pads missing rows with NaN, so a
# row-count mismatch between the two inputs would otherwise pass unnoticed into the
# saved data set. Fail loudly instead.
if len(df) != len(energies):
    raise ValueError(
        "Row count mismatch: {} CCF rows vs {} energy rows.".format(len(df), len(energies))
    )
df3 = pd.concat([df, energies], axis=1, sort=False)
# Uncomment the next line to shuffle the rows.
#df3 = df3.sample(frac=1)

In [7]:
# Save the combined data frame as a pickle file. The path is relative, so the file
# is written to the current working directory.
output_file = 'input_data_ccf.pkl'
df3.to_pickle(output_file)