### Program written by Scott Midgley, 2021
### Scope: To train and test MLP models for band gap energy screening in the configurational space of MgO-ZnO solid solutions.

In [None]:
### USER INPUT REQUIRED ###

# Paste the path to your local copy of the repository here, then comment/uncomment
# the line that matches your operating system.
# E.g. repodir = r'C:\Users\<user>\Desktop\repository'
# repodir is the root from which the notebook changes into its working directories.

# Windows path (raw string, so the backslashes are not treated as escapes)
#repodir = r'<windows\path\here>'
repodir = r'C:\Users\smidg\Desktop\ml\repository'

# Unix path
#repodir = '<unix/path/here>'

In [None]:
### USER INPUT REQUIRED ###

# Define the percentage of the data set to use for training.
# Supported values are 10, 30, 50 and 80: the data-splitting cell only has row
# ranges for these. Leave exactly one assignment uncommented.
split = 10
#split = 50
#split = 80
#split = 30

In [None]:
# Import modules.
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam #Stochasic gradient descent method optimising weights and activations
import copy
from tensorflow.keras.callbacks import ModelCheckpoint
import time
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import os

In [None]:
# Clear any existing Tensorflow data from cache.
# Resets Keras' global state so that re-running the notebook in the same kernel
# does not build on models and layers left over from a previous run.
tf.keras.backend.clear_session()

In [None]:
# Start program timer.
# start_time is read again in the final cell to report the total wall-clock run time.
start_time = time.time()

In [None]:
# Define MLP working directory.
# The target is <repodir>/coulomb_matrix/mlp. A single chdir on the joined path is
# used so that a missing sub-folder raises without leaving the kernel's working
# directory part-way down the tree.
os.chdir(os.path.join(repodir, 'coulomb_matrix', 'mlp'))
# Keep the resolved absolute path: later cells use it to get back to this directory.
mlpdir = os.getcwd()
print(mlpdir)

In [None]:
# Import .pkl data file and shuffle (optional).
# The file lives in the sibling folder <mlpdir>/../data/rundir. Its path is built
# from mlpdir rather than reached with a chain of os.chdir calls, so the load does
# not depend on the current directory and a failed read cannot leave the kernel
# stranded inside data/rundir.
# NOTE(review): unpickling can execute arbitrary code; only load input_data.pkl
# from a trusted copy of the repository.
data_path = os.path.join(mlpdir, os.pardir, 'data', 'rundir', 'input_data.pkl')
eners = pd.read_pickle(data_path)
# Uncomment to shuffle the rows before the positional train/val/test split.
#eners = eners.sample(frac=1)
# Make sure the notebook continues from the MLP working directory.
os.chdir(mlpdir)

In [None]:
# Split data frame into training, validation, and testing data.
# Rows are selected by position, so the three partitions never overlap:
#   [0, 804)     -> test
#   [804, 1608)  -> validation
#   [1608, stop) -> training, where stop depends on the requested percentage.
# The row counts are hard-coded (804 rows being 10% presumably means a data set of
# roughly 8040 rows -- confirm against input_data.pkl).
train_stop_by_split = {
    10: 2412,  # 10% of data for training
    30: 4021,  # 30% of data for training
    50: 5630,  # 50% of data for training
    80: None,  # 80% of data for training (everything from row 1608 onwards)
}
if split not in train_stop_by_split:
    # Stop here instead of printing and carrying on: otherwise e_train would be
    # undefined, or silently left over from an earlier run of this cell.
    raise ValueError(
        'Error: please choose a valid train/test split. '
        f'Got {split!r}; expected one of {sorted(train_stop_by_split)}.'
    )
e_train = eners.iloc[1608:train_stop_by_split[split]]
e_val = eners.iloc[804:1608] #10% of data for validation
e_test = eners.iloc[:804] #10% of data for testing model

In [None]:
# Print number of training data points
# Quick check that the chosen split selected the expected number of rows.
print('Number of training data points = ', len(e_train))

In [None]:
# Convert Pandas columns to Numpy arrays.
# Each column is pulled out as an array of per-row values and then stacked into a
# single ndarray. 'Coulomb' supplies the model inputs and 'BGE' the targets
# (presumably one descriptor vector and one scalar per row -- confirm against
# input_data.pkl).
def _stacked(frame, column):
    """Return frame[column] as one ndarray made by stacking its per-row values."""
    return np.stack(frame[column].to_numpy())


Xtrain, ytrain = _stacked(e_train, 'Coulomb'), _stacked(e_train, 'BGE')
Xval, yval = _stacked(e_val, 'Coulomb'), _stacked(e_val, 'BGE')
Xtest, ytest = _stacked(e_test, 'Coulomb'), _stacked(e_test, 'BGE')

In [None]:
# Define MLP architecture.
# Two hidden blocks (Dense -> BatchNormalization -> Dropout) of 64 and 32 units,
# followed by a single linear output unit for the regression target.
n_features = Xtrain.shape[1]  # length of one input sample
model = Sequential([
    Dense(64, input_dim=n_features, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='linear'),
])

In [None]:
# Summarize model trainable parameters.
# Prints the layer-by-layer table of output shapes and parameter counts.
model.summary()

In [None]:
# Define early stopping parameters.
# Training is halted once the validation MAE has gone 750 epochs without
# improving; Keras is asked to put the best-scoring weights back when it stops.
stopping = EarlyStopping(
    monitor='val_mae',
    mode='auto',
    patience=750,
    restore_best_weights=True,
    verbose=0,
)

In [None]:
# Define MLP hyperparameters.
# Optimizer: Adam with a learning rate of 1e-4 and its remaining settings spelled out.
ad = Adam(
    learning_rate=1e-4,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
# Model config: MAE is used as the loss and is also listed as a metric, because
# the early-stopping callback and the history cells look it up as 'val_mae'.
model.compile(optimizer=ad, loss='mae', metrics=['mae'])

In [None]:
# Train MLP.
# Runs for at most 10000 epochs in batches of 32 with no console output. The
# validation set is scored every epoch so the early-stopping callback can act on it.
history = model.fit(
    Xtrain,
    ytrain,
    validation_data=(Xval, yval),
    batch_size=32,
    epochs=10000,
    callbacks=[stopping],
    verbose=0,
)

In [None]:
# Plot validation MAE vs number of training epochs.
print(history.history.keys())
val_mae_curve = history.history['val_mae']  # one value per completed epoch
plt.plot(val_mae_curve)
plt.xlabel('Training Epoch')
plt.ylabel('Mean Absolute Error')
# Fix the y-range to [0, 1] so the curve is shown on a fixed scale.
plt.ylim(0, 1)
# Uncomment to save the figure next to the notebook.
#plt.savefig('val-mae-zoom.png')

In [None]:
# Print minimum MAE and number of training epochs completed.
# Both figures come from the validation curve: its lowest value and its length.
val_mae_per_epoch = history.history['val_mae']
mae = np.min(val_mae_per_epoch)
epochs = len(val_mae_per_epoch)
print('mae = ', mae)
print('Epochs: ', epochs)

In [None]:
# Test model.
# Predict the target for the held-out test rows. The network ends in Dense(1), so
# ypred presumably has shape (n_test, 1) rather than (n_test,) -- confirm before
# comparing it element-wise with ytest.
ypred = model.predict(Xtest)

In [None]:
# Export data
# Reference and predicted test values go side by side, one row per test sample.
# assign() sets each column the same way df[col] = value would, which matters
# because ypred may be a single-column 2-D array.
df_data = pd.DataFrame().assign(ytest=ytest, ypred=ypred)
# Run-level figures share one column: row 0 is the minimum validation MAE and
# row 1 the number of epochs trained.
metrics_df = pd.DataFrame().assign(metrics=(mae, epochs))

# Joining column-wise aligns on the row index, so 'metrics' is empty below row 1.
result = pd.concat((df_data, metrics_df), axis=1)
# Written to the current working directory.
result.to_excel('result-mlp.xlsx')

In [None]:
# Print time taken by program to run.
elapsed = time.time() - start_time
time_s = round(elapsed, 2)
# Minutes are derived from the already-rounded seconds value.
time_m = round(time_s / 60, 2)
for value, unit in ((time_s, 'sec'), (time_m, 'min')):
    print(value, unit)