In [None]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.linear_model import LinearRegression

In [None]:
#Load the training table (read from "training_data.xlsx"); it provides a 'SMILES' column
df = pd.read_excel("training_data.xlsx")

#Turn every SMILES string into an RDKit molecule object
df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)

#Generate Morgan Fingerprints (MF) and convert RDKit object to numpy array - Source: https://github.com/rdkit/UGM_2016/blob/master/Tutorials/Part3_Fingerprints_and_classification.ipynb
def computeMF(mol, depth=2, nBits=2048):
    """Return the Morgan bit-vector fingerprint of ``mol`` as a numpy array.

    Parameters
    ----------
    mol : RDKit molecule the fingerprint is computed for.
    depth : value passed to RDKit as the second (radius) argument.
    nBits : length of the fingerprint; also the length of the output array.

    Returns
    -------
    numpy.ndarray filled by ``DataStructs.ConvertToNumpyArray``, or ``None``
    when the fingerprint cannot be generated (e.g. ``mol`` is not a valid
    molecule object).
    """
    arr = np.zeros(nBits)
    try:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, depth, nBits)
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
    except Exception:
        #Best effort: a molecule that cannot be fingerprinted yields None.
        #`Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None
    return arr

#One fingerprint array (or None on failure) per molecule
df['MF'] = df['ROMol'].apply(computeMF)

#Generate Morgan Fingerprints with Frequency (MFF) and convert RDKit object to numpy array
def computeMFF(mol, depth=2, nBits=2048):
    """Return the hashed (count-based) Morgan fingerprint of ``mol`` as a numpy array.

    Parameters
    ----------
    mol : RDKit molecule the fingerprint is computed for.
    depth : value passed to RDKit as the second (radius) argument.
    nBits : length of the hashed fingerprint; also the length of the output array.

    Returns
    -------
    numpy.ndarray filled by ``DataStructs.ConvertToNumpyArray``, or ``None``
    when the fingerprint cannot be generated (e.g. ``mol`` is not a valid
    molecule object).
    """
    arr = np.zeros(nBits)
    try:
        fingerprint = AllChem.GetHashedMorganFingerprint(mol, depth, nBits)
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
    except Exception:
        #Best effort: a molecule that cannot be fingerprinted yields None.
        #`Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None
    return arr

df['MFF'] = df.apply(lambda x: computeMFF(x['ROMol']),axis=1)

#computeMF/computeMFF return None when a fingerprint cannot be generated. Such rows would
#turn the feature matrices below into ragged object arrays, so drop them (and say so) first.
has_fingerprints = df['MF'].notna() & df['MFF'].notna()
n_missing = int((~has_fingerprints).sum())
if n_missing:
    print(f"Dropping {n_missing} row(s) without a valid fingerprint")
df_valid = df[has_fingerprints]

#Hold out 20% of the molecules as an unseen test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(df_valid[['SMILES','MF','MFF']],df_valid['value'],test_size=0.2,random_state=42)

#Training matrices: X1 = MF features, X2 = MFF features, y = target values
X1 = np.array(X_train['MF'].values.tolist())
X2 = np.array(X_train['MFF'].values.tolist())
y = np.array(y_train.astype(float))

#Matching matrices for the held-out test set
X1_test = np.array(X_test['MF'].values.tolist())
X2_test = np.array(X_test['MFF'].values.tolist())
y_test = np.array(y_test.astype(float))


Random Forests (RF)

In [None]:
#Hyperparameter tuning of RF model for MF

#Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
#Number of features to consider at every split
max_features = ['log2', 'sqrt',1.0]
#Maximum number of levels in tree: 10, 20, ..., 110, plus None
max_depth = list(range(10, 111, 10)) + [None]
#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
#Method of selecting samples for training each tree
bootstrap = [True, False]
#Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

#Seed the forest as well as the search, otherwise candidate scores change from run to run
rf = RandomForestRegressor(random_state=0)
#Random search: 100 sampled parameter combinations, each scored with 3-fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=1, random_state=0, n_jobs = -1)
#Fit the random search model on the MF features
rf_random.fit(X1, y)

print(rf_random.best_params_)

In [None]:
#Implement grid search hyperparameter tuning
#(GridSearchCV is already imported in the notebook's first cell)

#Only the number of trees is varied here; every other parameter is held at a single value
param_grid = {'n_estimators' : [1500,1600,1700],
'max_features' : ['sqrt'],
'max_depth' : [70],
'min_samples_split' : [2],
'min_samples_leaf' : [1],
'bootstrap' : [False]
}

#Initialise model (seeded so that the candidates are compared on equal footing across runs)
rf = RandomForestRegressor(random_state=0)
#Implement grid search with 3-fold cross validation
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 1)
rf_grid.fit(X1,y)
print(rf_grid.best_params_)

In [None]:
#perform 5-fold cross validation with parameters (RF on MF features)
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X1,y):
    #NOTE(review): min_samples_split = 3 is not one of the values explored by the grid search ([2]) - confirm
    regr = RandomForestRegressor(n_estimators = 1600, max_depth = 70, min_samples_split = 3, max_features='sqrt', min_samples_leaf = 1, bootstrap = False, random_state = 0)
    regr.fit(X1[train],y[train])
    y_pred = regr.predict(X1[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))


#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

#plot data (y_true / y_pred hold the values of the last fold only)
y_true, y_pred = y_true.reshape(-1,1), y_pred.reshape(-1,1)
plt.scatter(y_true,y_pred,s=10,alpha=0.9)
#Black line: least-squares fit of predicted against actual values
plt.plot(y_true, LinearRegression().fit(y_true, y_pred).predict(y_true),color='black',alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
#test against unseen data
#`regr` is the estimator configured in the cross-validation cell; refit it on the whole training set
regr.fit(X1,y)
y_pred = regr.predict(X1_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
#Hyperparameter tuning of RF model for MFF

#Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
#Number of features to consider at every split
max_features = ['log2', 'sqrt',1.0]
#Maximum number of levels in tree: 10, 20, ..., 110, plus None
max_depth = list(range(10, 111, 10)) + [None]
#Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
#Method of selecting samples for training each tree
bootstrap = [True, False]
#Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

#Seed the forest as well as the search, otherwise candidate scores change from run to run
rf = RandomForestRegressor(random_state=0)
#Random search: 100 sampled parameter combinations, each scored with 3-fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=1, random_state=0, n_jobs = -1)
#Fit the random search model on the MFF features
rf_random.fit(X2, y)

print(rf_random.best_params_)

In [None]:
#Implement grid search hyperparameter tuning

#Number of trees and min_samples_split are varied; every other parameter is held at a single value
param_grid = {'n_estimators' : [1700,1800,1900],
'max_features' : ['sqrt'],
'max_depth' : [70],
'min_samples_split' : [2,3],
'min_samples_leaf' : [1],
'bootstrap' : [False]
}

#Initialise model (seeded so that the candidates are compared on equal footing across runs)
rf = RandomForestRegressor(random_state=0)
#Implement grid search with 3-fold cross validation
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 1)
rf_grid.fit(X2,y)
print(rf_grid.best_params_)

In [None]:
#perform 5-fold cross validation with parameters (RF on MFF features)
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X2,y):
    regr = RandomForestRegressor(n_estimators = 1800, max_depth = 70, min_samples_split = 2, max_features='sqrt', min_samples_leaf = 1, bootstrap = False, random_state = 0)
    regr.fit(X2[train],y[train])
    y_pred = regr.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
regr = RandomForestRegressor(n_estimators = 1800, max_depth = 70, min_samples_split = 2, max_features='sqrt', min_samples_leaf = 1, bootstrap = False, random_state = 0)
regr.fit(X2,y)
y_pred = regr.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

#save model
#The saved model is refitted on training + test data, so the scores printed above
#describe the train-only fit, not the pickled estimator.
X_comb = np.concatenate([X2,X2_test])
y_comb = np.concatenate([y,y_test])
regr.fit(X_comb,y_comb)
#Context manager guarantees the file is flushed and closed even if pickling fails
with open("Eg_C_Predictor.model","wb") as model_file:
    pickle.dump(regr, model_file)

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

Support Vector Machines (SVM)

In [None]:
#hyperparameter tuning of SVM model for MF

#The duplicated 0.01 gamma entry was removed: it only repeated identical fits.
#NOTE(review): the duplicate may have been meant as 0.1. Per the scikit-learn SVR docs gamma
#only affects the 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' it should not change the scores.
param_grid = {'kernel' : ['linear'],
              'C' : [1, 10, 100],
              'gamma' : [0.001, 0.01,'scale','auto'],
              'epsilon' : [0.001, 0.01, 0.1, 1]
              }

#Create model
svr = svm.SVR()
#Perform grid search with 3-fold cross validation
svr_grid = GridSearchCV(estimator = svr, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 1)
#Fit grid search model on the MF features
svr_grid.fit(X1,y)

print(svr_grid.best_params_)

In [None]:
#further tuning of SVM model

#Narrower search space for the linear-kernel SVR
param_grid = dict(
    kernel=['linear'],
    C=[0.1,1,10],
    gamma=[0.001, 0.01],
    epsilon=[0.001,0.01],
)

#Exhaustive search over every combination, each scored with 5-fold cross validation
svr = svm.SVR()
svr_grid = GridSearchCV(svr, param_grid, cv=5, n_jobs=-1, verbose=1)
svr_grid.fit(X1,y)

print(svr_grid.best_params_)

In [None]:
#Compare kernel types for the SVR on MF features
param_grid = dict(
    kernel=['rbf','linear','poly'],
    C=[0.1,1],
    gamma=[0.001],
    epsilon=[0.001,0.01],
)

#Exhaustive search over every combination, each scored with 5-fold cross validation
svr = svm.SVR()
svr_grid = GridSearchCV(svr, param_grid, cv=5, n_jobs=-1, verbose=1)
svr_grid.fit(X1,y)

print(svr_grid.best_params_)

In [None]:
#Perform 5-fold cross validation with parameters (SVR on MF features)
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X1,y):
    regr = svm.SVR(kernel="linear", C=0.1, tol=0.001, gamma=0.001,epsilon=0.1)
    regr.fit(X1[train],y[train])
    y_pred = regr.predict(X1[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #plot data: one predicted-vs-actual figure per fold, annotated with that fold's r2
    y_true, y_pred = y_true.reshape(-1,1), y_pred.reshape(-1,1)
    plt.scatter(y_true,y_pred,s=10,alpha=0.9)
    plt.plot(y_true, LinearRegression().fit(y_true, y_pred).predict(y_true),color='black',alpha=0.7)
    plt.annotate("r2 = {:.4f}".format(r2_score(y_true, y_pred)), (7.5, 1))
    plt.xlabel("Actual Bandgap (eV)")
    plt.ylabel("Predicted Bandgap (eV)")
    plt.show()


#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#Test model on unseen data
#`regr` is the estimator configured in the cross-validation cell; refit it on the whole training set
regr.fit(X1,y)
y_pred = regr.predict(X1_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
#further tuning of SVM model for MFF

#The duplicated 0.01 gamma entry was removed: it only repeated identical fits.
#NOTE(review): the duplicate may have been meant as 0.1. Per the scikit-learn SVR docs gamma
#only affects the 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' it should not change the scores.
param_grid = {'kernel' : ['linear'],
              'C' : [0.01,0.1,1],
              'gamma' : [0.001, 0.01],
              'epsilon' : [0.001, 0.01, 0.1, 1]
              }

#Create model
svr = svm.SVR()
#Perform grid search with 5-fold cross validation
svr_grid = GridSearchCV(estimator = svr, param_grid = param_grid, cv = 5, verbose = 1)
#Fit grid search model on the MFF features
svr_grid.fit(X2,y)

print(svr_grid.best_params_)

In [None]:
#Perform 5-fold cross validation with parameters (SVR on MFF features)
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X2,y):
    #NOTE(review): C=0.05 is not one of the values explored by the grid search ([0.01,0.1,1]) - confirm
    regr = svm.SVR(kernel="linear", C=0.05, tol=0.001, gamma=0.001,epsilon=0.1)
    regr.fit(X2[train],y[train])
    y_pred = regr.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))


#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#Test model on unseen data
#`regr` is the estimator configured in the cross-validation cell; refit it on the whole training set
regr.fit(X2,y)
y_pred = regr.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

Gaussian Process Regression (GPR)

In [None]:
#import GPR model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF

#initialise kernel: white-noise term plus an RBF term
kernel = WhiteKernel(noise_level=0.1) + RBF(length_scale=10)

#perform 5-fold cross validation for MF
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X1,y):
    regr = GaussianProcessRegressor(kernel=kernel,random_state=0)
    regr.fit(X1[train],y[train])
    y_pred = regr.predict(X1[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
#`regr` is the estimator configured in the cross-validation cell; refit it on the whole training set
regr.fit(X1,y)
y_pred = regr.predict(X1_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
#initialise kernel: white-noise term plus an RBF term
kernel = WhiteKernel(noise_level=0.1) + RBF(length_scale=10)

#perform 5-fold cross validation for MFF
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train,test in kf.split(X2,y):
    regr = GaussianProcessRegressor(kernel=kernel,random_state=0)
    regr.fit(X2[train],y[train])
    y_pred = regr.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
#`regr` is the estimator configured in the cross-validation cell; refit it on the whole training set
regr.fit(X2,y)
y_pred = regr.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

Convolutional Neural Network (CNN)

In [None]:
#import libraries
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

#define parameters
batch_size = 32
epochs = 100
#Conv1D input layout is (samples, steps, channels). Reshaping with -1 (instead of X1.shape[1])
#gives the same result on 2-D input and keeps the cell re-runnable once X1 is already 3-D.
X1 = X1.reshape(X1.shape[0],-1,1)
X1_test = X1_test.reshape(X1_test.shape[0],-1,1)

def build_cnn_mf(input_shape):
    """Return a freshly initialised, compiled 1D-CNN regressor for MF inputs.

    input_shape: shape of a single sample, i.e. (n_bits, 1).
    """
    net = Sequential()
    net.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Conv1D(128, kernel_size=3, activation='relu'))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dense(1, activation='linear'))
    #compile model
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

#run k-cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train,test in kf.split(X1,y):
    #A new model per fold: reusing one model would carry over weights that were already
    #fitted on this fold's validation samples in earlier folds and inflate the scores.
    model = build_cnn_mf(X1.shape[1:])
    hist = model.fit(X1[train],y[train],batch_size=batch_size,epochs=epochs,verbose=0,validation_data=(X1[test], y[test]))
    y_pred = model.predict(X1[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained model bound to `model` so the final fit on the full training set starts from scratch
model = build_cnn_mf(X1.shape[1:])

#learning curve (loss per epoch, averaged over the folds)
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
#Fit `model` (bound by the preceding cross-validation cell) on the whole training set
model.fit(X1,y,batch_size=batch_size,epochs=epochs,verbose=0)
y_pred = model.predict(X1_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
#import libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

#define parameters
batch_size = 32
epochs = 100
#Conv1D input layout is (samples, steps, channels). Reshaping with -1 (instead of X2.shape[1])
#gives the same result on 2-D input and keeps the cell re-runnable once X2 is already 3-D.
X2 = X2.reshape(X2.shape[0],-1,1)
X2_test = X2_test.reshape(X2_test.shape[0],-1,1)

def build_cnn_mff(input_shape):
    """Return a freshly initialised, compiled 1D-CNN regressor for MFF inputs.

    input_shape: shape of a single sample, i.e. (n_bits, 1).
    """
    net = Sequential()
    net.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Conv1D(128, kernel_size=3, activation='relu'))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dense(1, activation='linear'))
    #compile model
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

#run k-cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train,test in kf.split(X2,y):
    #A new model per fold: reusing one model would carry over weights that were already
    #fitted on this fold's validation samples in earlier folds and inflate the scores.
    model = build_cnn_mff(X2.shape[1:])
    hist = model.fit(X2[train],y[train],batch_size=batch_size,epochs=epochs,verbose=0,validation_data=(X2[test], y[test]))
    y_pred = model.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained model bound to `model` so the final fit on the full training set starts from scratch
model = build_cnn_mff(X2.shape[1:])

#learning curve (loss per epoch, averaged over the folds)
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
#Fit `model` (bound by the preceding cross-validation cell) on the whole training set
model.fit(X2,y,batch_size=batch_size,epochs=epochs,verbose=0)
y_pred = model.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

Recurrent Neural Network (RNN)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

#define parameters
batch_size = 32
epochs = 100
#LSTM input layout is (samples, timesteps, features): one timestep holding the whole fingerprint.
#Reshaping with -1 (instead of X1.shape[1]) keeps the cell re-runnable once X1 is already 3-D.
X1 = X1.reshape(X1.shape[0],1,-1)
X1_test = X1_test.reshape(X1_test.shape[0],1,-1)

def build_lstm_mf(input_shape):
    """Return a freshly initialised, compiled LSTM regressor for MF inputs.

    input_shape: shape of a single sample, i.e. (1, n_bits).
    """
    net = Sequential()
    net.add(LSTM(64, input_shape=input_shape))
    net.add(Dense(1, activation='linear'))
    #compile model
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

#run k-cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train,test in kf.split(X1,y):
    #A new model per fold: reusing one model would carry over weights that were already
    #fitted on this fold's validation samples in earlier folds and inflate the scores.
    model = build_lstm_mf(X1.shape[1:])
    hist = model.fit(X1[train],y[train],batch_size=batch_size,epochs=epochs,verbose=0,validation_data=(X1[test], y[test]))
    y_pred = model.predict(X1[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained model bound to `model` so the final fit on the full training set starts from scratch
model = build_lstm_mf(X1.shape[1:])

#learning curve (loss per epoch, averaged over the folds)
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")


In [None]:
#test against unseen data
#Fit `model` (bound by the preceding cross-validation cell) on the whole training set
model.fit(X1,y,batch_size=batch_size,epochs=epochs,verbose=0)
y_pred = model.predict(X1_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

#define parameters
batch_size = 32
epochs = 100
#LSTM input layout is (samples, timesteps, features): one timestep holding the whole fingerprint.
#Reshaping with -1 (instead of X2.shape[1]) keeps the cell re-runnable once X2 is already 3-D.
X2 = X2.reshape(X2.shape[0],1,-1)
X2_test = X2_test.reshape(X2_test.shape[0],1,-1)

def build_lstm_mff(input_shape):
    """Return a freshly initialised, compiled LSTM regressor for MFF inputs.

    input_shape: shape of a single sample, i.e. (1, n_bits).
    """
    net = Sequential()
    net.add(LSTM(64, input_shape=input_shape))
    net.add(Dense(1, activation='linear'))
    #compile model
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

#run k-cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train,test in kf.split(X2,y):
    #A new model per fold: reusing one model would carry over weights that were already
    #fitted on this fold's validation samples in earlier folds and inflate the scores.
    model = build_lstm_mff(X2.shape[1:])
    hist = model.fit(X2[train],y[train],batch_size=batch_size,epochs=epochs,verbose=0,validation_data=(X2[test], y[test]))
    y_pred = model.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained model bound to `model` so the final fit on the full training set starts from scratch
model = build_lstm_mff(X2.shape[1:])

#learning curve (loss per epoch, averaged over the folds)
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")


In [None]:
#test against unseen data
#Fit `model` (bound by the preceding cross-validation cell) on the whole training set
model.fit(X2,y,batch_size=batch_size,epochs=epochs,verbose=0)
y_pred = model.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))

In [None]:
#plot data: predicted against actual values plus a least-squares trend line
y_true = y_true.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)
plt.scatter(y_true, y_pred, s=10, alpha=0.9)
plt.plot(y_true, trend, color='black', alpha=0.7)
plt.xlabel("Actual Bandgap (eV)")
plt.ylabel("Predicted Bandgap (eV)")
plt.show()

CNN (with Dropout)

In [None]:
#import libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

#define parameters
batch_size = 32
epochs = 100
#Conv1D input layout is (samples, steps, channels). X2 may already be 3-D here (the LSTM cell
#reshapes it to (samples, 1, bits)), in which case X2.shape[1] is 1 and the original reshape
#would raise; -1 restores (samples, bits, 1) from any previous layout.
X2 = X2.reshape(X2.shape[0],-1,1)
X2_test = X2_test.reshape(X2_test.shape[0],-1,1)

def build_cnn_dropout_mff(input_shape):
    """Return a freshly initialised, compiled 1D-CNN regressor with dropout for MFF inputs.

    input_shape: shape of a single sample, i.e. (n_bits, 1).
    """
    net = Sequential()
    net.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Dropout(0.25))  # Adding dropout layer
    net.add(Conv1D(128, kernel_size=3, activation='relu'))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Dropout(0.25))  # Adding dropout layer
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.5))   # Adding dropout layer
    net.add(Dense(1, activation='linear'))
    #compile model
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net

#run k-cross validation
#This model is for the MFF features, so train, validate and predict on X2 throughout
#(the original mixed X1 for training/prediction with X2 for validation).
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train,test in kf.split(X2,y):
    #A new model per fold: reusing one model would carry over weights that were already
    #fitted on this fold's validation samples in earlier folds and inflate the scores.
    model = build_cnn_dropout_mff(X2.shape[1:])
    hist = model.fit(X2[train],y[train],batch_size=batch_size,epochs=epochs,verbose=0,validation_data=(X2[test], y[test]))
    y_pred = model.predict(X2[test])
    y_true = np.array(y[test])
    r2.append(r2_score(y_true,y_pred))
    mae.append(mean_absolute_error(y_true,y_pred))
    #RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
    rmse.append(np.sqrt(mean_squared_error(y_true,y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained model bound to `model` so the final fit on the full training set starts from scratch
model = build_cnn_dropout_mff(X2.shape[1:])

#learning curve (loss per epoch, averaged over the folds)
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")


In [None]:
#test against unseen data
#Fit `model` (bound by the preceding cross-validation cell) on the whole training set
model.fit(X2,y,batch_size=batch_size,epochs=epochs,verbose=0)
y_pred = model.predict(X2_test)
y_true = np.array(y_test)

#R2, MAE and RMSE on the held-out test set
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
#RMSE = sqrt(MSE); the `squared=False` keyword was deprecated and later removed from scikit-learn
print(np.sqrt(mean_squared_error(y_true,y_pred)))