### Program written by Scott Midgley, 2021
### Scope: To train and test GBDT models for band gap energy screening in the configurational space of MgO-ZnO solid solutions.

In [1]:
### USER INPUT REQUIRED ###

# Set `repodir` to the path of the repository root, then comment/uncomment
# the lines below as needed for your platform.
# E.g. repodir = r'C:\Users\<user>\Desktop\repository'
# A later cell changes into the 'coulomb_matrix' sub-directory of this path.

# Windows path (raw string, so backslashes are not treated as escape characters)
#repodir = r'<windows\path\here>'
repodir = r'C:\Users\smidg\Desktop\ml\repository'

# Unix path
#repodir = '<unix/path/here>'

In [2]:
### USER INPUT REQUIRED ###

# Percentage of the data set to use for training.
# The data-splitting cell only recognises 10, 30, 50 and 80; leave exactly one
# of the lines below uncommented.
split = 10
#split = 50
#split = 80
#split = 30

In [3]:
# Import modules. 
import pandas as pd
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import pickle
import time
from sklearn import datasets, ensemble
from sklearn import metrics 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV   
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import os

In [4]:
# Start program timer.
# Wall-clock timestamp in seconds; the final cell subtracts it from the
# current time to report the total run time.
start_time = time.time()

In [5]:
# Define GBDT working directory.
# Descend one level at a time: <repodir> -> coulomb_matrix -> gbdt.
for path_part in (repodir, 'coulomb_matrix', 'gbdt'):
    os.chdir(path_part)

# Remember the resulting directory so later cells can return to it.
dtdir = os.getcwd()
print(dtdir)

C:\Users\smidg\Desktop\ml\repository\coulomb_matrix\gbdt


In [6]:
# Import data and shuffle (optional).
# The input pickle lives in ../data/rundir relative to the GBDT directory.
# Build that path from `dtdir` instead of walking the working directory there
# with os.chdir: if the read fails, the kernel is not left stranded in the data
# directory and the cell can simply be re-run.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
data_file = os.path.join(dtdir, os.pardir, 'data', 'rundir', 'input_data.pkl')
eners = pd.read_pickle(data_file)
#eners = eners.sample(frac=1)

# Keep the GBDT directory as the working directory for the cells that follow.
os.chdir(dtdir)

In [7]:
# Split data frame into training, validation, and testing data.
# Rows [0, 804) form the test set, rows [804, 1608) the validation set, and the
# training set always starts at row 1608; only its end row depends on `split`.
# NOTE(review): the hard-coded row bounds presumably match the size of
# input_data.pkl - confirm if the data set changes.
_train_stop = {
    10: 2412,  # 10% of data for training
    30: 4021,  # 30% of data for training
    50: 5630,  # 50% of data for training
    80: None,  # 80% of data for training (all remaining rows)
}
if split not in _train_stop:
    # Fail fast: merely printing a message would leave e_train undefined (or
    # stale from an earlier run) and only break later, in a less obvious place.
    raise ValueError(
        'Error: please choose a valid train/test split. '
        f'Got split={split!r}; expected one of {sorted(_train_stop)}.'
    )
e_train = eners.iloc[1608:_train_stop[split]]
e_val = eners.iloc[804:1608] #10% of data for validation
e_test = eners.iloc[:804] #10% of data for testing model

In [8]:
# Convert Pandas columns to Numpy arrays.
def _features_and_targets(frame):
    """Return (X, y) Numpy arrays built from the 'Coulomb' and 'BGE' columns.

    Each column is converted with ``to_numpy`` and its per-row entries are then
    joined into one array with ``np.stack``.
    """
    features = np.stack(frame['Coulomb'].to_numpy())
    targets = np.stack(frame['BGE'].to_numpy())
    return features, targets


Xtrain, ytrain = _features_and_targets(e_train)
Xval, yval = _features_and_targets(e_val)
Xtest, ytest = _features_and_targets(e_test)

In [9]:
# Define GBDT hyperparameters.
# 'loss' is intentionally not set: the estimator default is least-squares
# regression, which is what the former 'ls' value selected. The 'ls' spelling
# was deprecated in scikit-learn 1.0 and later removed, so passing it
# explicitly breaks on current releases, while the default works on old and
# new versions alike.
# 'random_state' fixes the seed used by the individual trees so that repeated
# runs of the notebook give the same model.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'random_state': 0}
reg = ensemble.GradientBoostingRegressor(**params)
# Fit on the training split; the fitted estimator is echoed as the cell output.
reg.fit(Xtrain, ytrain)

GradientBoostingRegressor(learning_rate=0.01, max_depth=4, min_samples_split=5,
                          n_estimators=1000)

In [10]:
# Test GBDT model.
# Predict values for the held-out test features with the regressor fitted above;
# the predictions are compared against ytest in the next cell.
ypred = reg.predict(Xtest)

In [11]:
# Print metrics.
# Mean absolute error between the true test targets and the model predictions.
mae = mean_absolute_error(y_true=ytest, y_pred=ypred)
print('mae = ', mae)

mae =  0.015473986745702677


In [12]:
# Export data.
# One row per test sample: the true value next to the model prediction.
df_data = pd.DataFrame({'ytest': ytest, 'ypred': ypred})

# Single-row frame holding the summary metric(s). It is named metrics_df rather
# than `metrics` so that it does not shadow the sklearn `metrics` module
# imported at the top of the notebook.
metrics_list = [mae]
metrics_df = pd.DataFrame({'metrics': metrics_list})

# Place the metric column beside the per-sample columns; rows beyond the first
# have no metric value and are left empty in that column.
result = pd.concat([df_data, metrics_df], axis=1)
result.to_excel('result.xlsx')

In [13]:
# Print time taken by program to run.
elapsed = time.time() - start_time
time_s = round(elapsed, 2)
# Minutes are derived from the already-rounded seconds value.
time_m = round(time_s / 60, 2)
print(time_s, 'sec')
print(time_m, 'min')

8.81 sec
0.15 min
