### Program written by Scott Midgley, 2021
### Scope: To train and test LR models for mixing energy screening in the configurational space of MgO-ZnO solid solutions.

In [None]:
### USER INPUT REQUIRED ###

# Please paste the path to the repository here and comment/uncomment as needed.
# E.g. repodir = r'C:\Users\<user>\Desktop\repository'
# A later cell changes into repodir and then into its
# 'cluster_correlation_functions' sub-directory, so that folder must exist there.

# Windows path
#repodir = r'<windows\path\here>'
repodir = r'C:\Users\smidg\Desktop\ml\repository'

#Unix path
#repodir = '<unix/path/here>'

In [None]:
### USER INPUT REQUIRED ###

# Define the percentage of the data set to use for training.
# Only 10, 30, 50 and 80 are handled by the data-splitting cell further down.
split = 10
#split = 50
#split = 80
#split = 30

In [None]:
# Import modules. 
import pandas as pd
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import pickle
import os
import time
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import os

In [None]:
# Start program timer.
# Wall-clock reference; the final cell subtracts it from time.time() to report the run time.
start_time = time.time()

In [None]:
# Define LR working directory.
# Walk from the repository root down to cluster_correlation_functions/lr/EGAP one
# component at a time, then record the resulting absolute path for later cells.
for folder in (repodir, 'cluster_correlation_functions', 'lr', 'EGAP'):
    os.chdir(folder)
lrdir = os.getcwd()
print(lrdir)

In [None]:
# Import data and shuffle (optional).
# The pickle lives two levels above lrdir, in data/rundir. The path is built from
# lrdir instead of a chain of relative os.chdir calls, so this cell gives the same
# result when it is re-run or when the working directory was changed in between.
data_path = os.path.join(lrdir, os.pardir, os.pardir, 'data', 'rundir', 'input_data_ccf.pkl')
# NOTE(security): unpickling can execute arbitrary code - only load trusted files.
eners = pd.read_pickle(data_path)
# Fixed seed: the shuffle decides which rows land in the train/validation/test
# slices, so without it every run reports different metrics.
shuffle_seed = 42
eners = eners.sample(frac=1, random_state=shuffle_seed)
# Later cells write their output files relative to the working directory.
os.chdir(lrdir)

In [None]:
# Split data frame into training, validation, and testing data.
# Row layout of the frame: [0, 804) testing, [804, 1608) validation and
# [1608, stop) training, where stop depends on the requested split.
# NOTE(review): the hard-coded bounds presumably assume a data set in which
# 804 rows are 10% - confirm against input_data_ccf.pkl.
train_start = 1608
train_stop_by_split = {
    10: 2412,  # 10% of data for training
    30: 4021,  # 30% of data for training
    50: 5630,  # 50% of data for training
    80: None,  # 80% of data for training (all remaining rows)
}
if split not in train_stop_by_split:
    # Fail fast. Printing a message and carrying on would leave e_train undefined
    # (or stale from an earlier run) for every cell that follows.
    raise ValueError(
        'Error: please choose a valid train/test split. '
        'Got %r; expected one of %s.' % (split, sorted(train_stop_by_split))
    )
e_train = eners.iloc[train_start:train_stop_by_split[split]]
e_val = eners.iloc[804:1608] #10% of data for validation
e_test = eners.iloc[:804] #10% of data for testing model

In [None]:
# Convert Pandas columns to Numpy arrays. Reshaping to obtain array of nested brackets.
def _columns_to_arrays(frame):
    """Return (features, targets) for one data subset.

    Both the 'cf_vector' and the 'BGE' column are converted with to_numpy() and
    then passed through np.stack, which joins the per-row entries along a new
    leading axis.
    """
    features = np.stack(frame['cf_vector'].to_numpy())
    targets = np.stack(frame['BGE'].to_numpy())
    return features, targets


Xtrain, ytrain = _columns_to_arrays(e_train)
Xval, yval = _columns_to_arrays(e_val)
Xtest, ytest = _columns_to_arrays(e_test)

In [None]:
# Define Lasso regularized regressor.
# alpha is passed to Lasso as its regularisation strength; the model is then
# fitted on the training arrays only.
lasso_alpha = 1e-6
lasso_reg = Lasso(alpha=lasso_alpha)
lasso_reg.fit(Xtrain, ytrain)

In [None]:
# Start a dataframe for regressor metrics.
# It starts out empty; columns are added by a later cell once the metrics exist,
# and the frame is then written to metrics.csv.
metrics_df = pd.DataFrame()

In [None]:
# Print regressor metrics.
# Each metric is kept as a (label, value) pair; r_sq and coef are reused when the
# metrics data frame is filled. Note that R^2 is evaluated on the training data.
r_sq = ('R^2:  ', lasso_reg.score(Xtrain, ytrain))
inter = ('intercept: ', lasso_reg.intercept_)
coef = ('coefficient:  ', lasso_reg.coef_)
for labelled_metric in (r_sq, inter, coef):
    print(labelled_metric)

In [None]:
# Test the regressor.
# Predict the target for the held-out test features; ypred is compared with ytest below.
ypred = lasso_reg.predict(Xtest)

In [None]:
# Add metrics to data frame.
# The mean absolute error is computed on the test set and stored, like the other
# metrics, as a (label, value) pair.
# NOTE(review): every column is assigned a two-element tuple, so the label string
# presumably ends up as a data row next to the value - confirm this layout of
# metrics.csv is intended.
test_mae = mean_absolute_error(ytest, ypred)
mae = ('MAE:  ', test_mae)
print('mae = ', mae)
for column, labelled_metric in (('mae', mae), ('r_sq', r_sq), ('coef', coef)):
    metrics_df[column] = labelled_metric

In [None]:
# Save metrics data frame to file.
# The path is relative, so the file is written to the current working directory
# (the data-import cell changes back to lrdir before this point).
metrics_df.to_csv('metrics.csv')

In [None]:
# Plot the DFT reference values of the test set (x) against the regressor predictions (y).
axes = plt.gca()
axes.scatter(ytest, ypred)
axes.set_xlabel('DFT')
axes.set_ylabel('Regressor')
#plt.savefig('BGE_pred_vs_val.png')

In [None]:
# Export data.
# Test targets and the matching predictions go side by side into one sheet,
# written to the current working directory.
df_data = pd.DataFrame({'ytest': ytest, 'ypred': ypred})
df_data.to_excel('data.xlsx')

In [None]:
# Print time taken by program to run.
# Elapsed wall-clock time since start_time, in seconds and minutes; the minutes
# are derived from the already rounded seconds value.
elapsed = time.time() - start_time
time_s = round(elapsed, 2)
time_m = round(time_s / 60, 2)
for amount, unit in ((time_s, 'sec'), (time_m, 'min')):
    print(amount, unit)