In [None]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec, keyedvectors

from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.linear_model import LinearRegression


In [None]:
#Import dataset
df = pd.read_excel("training_data.xlsx")

#Generate RDKit molecules from the SMILES column.
#Chem.MolFromSmiles returns None (it does not raise) for a SMILES string it cannot parse.
df['ROMol'] = df['SMILES'].map(Chem.MolFromSmiles)

#Drop unparsable molecules up front: passing None on to mol2alt_sentence would
#fail with an error that does not identify the offending SMILES string.
invalid_mask = df['ROMol'].isna()
if invalid_mask.any():
    print(f"Dropping {int(invalid_mask.sum())} row(s) with unparsable SMILES: {df.loc[invalid_mask, 'SMILES'].tolist()}")
    df = df.loc[~invalid_mask].reset_index(drop=True)

#Load pre-trained mol2vec model
model = word2vec.Word2Vec.load('mol2vec-master/examples/models/model_300dim.pkl')

#Convert every molecule to a mol2vec "sentence" (second argument of mol2alt_sentence is 1),
#then embed each sentence as a single vector; unseen='UNK' is forwarded to sentences2vec.
df['sentence'] = df['ROMol'].map(lambda mol: MolSentence(mol2alt_sentence(mol, 1)))
df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')]

#Hold out 20% of the rows as an unseen test set (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(df[['SMILES','mol2vec']],df['value'],test_size=0.2,random_state=42)

#Stack the per-molecule vectors into 2-D feature matrices (one row per molecule)
X_train_vec = np.array([x.vec for x in X_train['mol2vec']])
X_test_vec = np.array([x.vec for x in X_test['mol2vec']])

#Targets as plain float arrays so they can be indexed with the KFold index arrays later on
y_train = np.array(y_train.astype(float))
y_test = np.array(y_test.astype(float))

Random Forests (RF)

In [None]:
#Hyperparameter tuning of RF model: coarse randomised search over a wide grid

#Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
#Number of features to consider at every split
max_features = ['log2', 'sqrt', 1.0]
#Maximum number of levels in tree: 10, 20, ..., 110, plus None
max_depth = list(range(10, 111, 10)) + [None]
#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
#Method of selecting samples for training each tree
bootstrap = [True, False]
#Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

#Initialise model; a fixed random_state makes the search reproducible between runs
rf = RandomForestRegressor(random_state=0)
#Random search: 100 sampled parameter combinations, each scored with 3-fold cross validation
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=1, random_state=0, n_jobs=-1)
#Fit the random search model
rf_random.fit(X_train_vec, y_train)

print(rf_random.best_params_)

In [None]:
#Implement grid search hyperparameter tuning: exhaustive search over a narrow grid

param_grid = {
    'n_estimators': [1600, 1700, 1800],
    'max_features': ['sqrt'],
    'max_depth': [70],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1],
    'bootstrap': [False],
}

#Initialise model; seeded so repeated runs select the same parameters
#(the cross-validation cell that uses these values also fixes random_state=0)
rf = RandomForestRegressor(random_state=0)
#Implement grid search with 3-fold cross validation
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
rf_grid.fit(X_train_vec, y_train)
print(rf_grid.best_params_)

In [None]:
#Perform 5-fold cross validation on the training set with the tuned RF parameters
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train, test in kf.split(X_train_vec, y_train):
    #A new estimator per fold; `regr` stays defined after the loop
    regr = RandomForestRegressor(n_estimators=1800, max_depth=70, min_samples_split=2, max_features='sqrt', min_samples_leaf=1, bootstrap=False, random_state=0)
    regr.fit(X_train_vec[train], y_train[train])
    y_pred = regr.predict(X_train_vec[test])
    y_true = np.array(y_train[test])
    r2.append(r2_score(y_true, y_pred))
    mae.append(mean_absolute_error(y_true, y_pred))
    #RMSE computed as sqrt(MSE): the squared=False keyword of mean_squared_error
    #is deprecated in scikit-learn 1.4 and removed in 1.6
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#Test against unseen data: refit the estimator on the full training set, then score the held-out test set
regr.fit(X_train_vec, y_train)
y_pred = regr.predict(X_test_vec)
y_true = np.array(y_test)

#r2, MAE and RMSE on the test set
print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))
#sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
print(np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
#Parity plot: predicted against actual values, with a least-squares trend line
y_true = y_true.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=10, alpha=0.9)
ax.plot(y_true, trend, color='black', alpha=0.7)
ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
plt.show()

Support Vector Machine (SVM)

In [None]:
#Hyperparameter tuning of SVM model

#The gamma list previously contained 0.01 twice (most likely a typo for 0.1), so
#identical candidates were fitted twice. NOTE: SVR ignores gamma when kernel='linear',
#so the gamma entries cannot change the scores of this particular search.
param_grid = {'kernel' : ['linear'],
              'C' : [0.1, 1, 10],
              'gamma' : [0.001, 0.01, 0.1, 'scale', 'auto'],
              'epsilon' : [0.001, 0.01, 0.1, 1]
              }

#Initialise model
svr = svm.SVR()
#Perform grid search with 3-fold cross validation
svr_grid = GridSearchCV(estimator=svr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
#Fit grid search model
svr_grid.fit(X_train_vec, y_train)

print(svr_grid.best_params_)

In [None]:
#Further tuning of SVM model: compare kernels at fixed C

param_grid = dict(
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    C=[0.1],
    gamma=[0.001, 'auto'],
    epsilon=[0.01, 0.1, 1],
)

#Fresh estimator for this search
svr = svm.SVR()
#Exhaustive search over the grid, scored with 5-fold cross validation
svr_grid = GridSearchCV(svr, param_grid, cv=5, n_jobs=-1, verbose=1)
svr_grid.fit(X_train_vec, y_train)

print(svr_grid.best_params_)

In [None]:
#Perform 5-fold cross validation on the training set with the chosen SVR parameters
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train, test in kf.split(X_train_vec, y_train):
    #A new estimator per fold; `regr` stays defined after the loop
    regr = svm.SVR(kernel="linear", C=0.1, tol=0.001, gamma=0.001, epsilon=0.1)
    regr.fit(X_train_vec[train], y_train[train])
    y_pred = regr.predict(X_train_vec[test])
    y_true = np.array(y_train[test])
    r2.append(r2_score(y_true, y_pred))
    mae.append(mean_absolute_error(y_true, y_pred))
    #RMSE computed as sqrt(MSE): the squared=False keyword of mean_squared_error
    #is deprecated in scikit-learn 1.4 and removed in 1.6
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#Test model on unseen data: refit the estimator on the full training set, then score the held-out test set
regr.fit(X_train_vec, y_train)
y_pred = regr.predict(X_test_vec)
y_true = np.array(y_test)

#r2, MAE and RMSE on the test set
print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))
#sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
print(np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
#Parity plot: predicted against actual values, with a least-squares trend line
y_true = y_true.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=10, alpha=0.9)
ax.plot(y_true, trend, color='black', alpha=0.7)
ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
plt.show()

Gaussian Process Regression (GPR)

In [None]:
#import GPR model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF

#Initialise kernel: white-noise term plus an RBF term, with the given starting values
kernel = WhiteKernel(noise_level=0.1) + RBF(length_scale=10)

#Perform 5-fold cross validation on the training set
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
for train, test in kf.split(X_train_vec, y_train):
    #A new regressor per fold; `regr` stays defined after the loop
    regr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    regr.fit(X_train_vec[train], y_train[train])
    y_pred = regr.predict(X_train_vec[test])
    y_true = np.array(y_train[test])
    r2.append(r2_score(y_true, y_pred))
    mae.append(mean_absolute_error(y_true, y_pred))
    #RMSE computed as sqrt(MSE): the squared=False keyword of mean_squared_error
    #is deprecated in scikit-learn 1.4 and removed in 1.6
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))

#Per-fold scores followed by their averages
print(r2)
print(mae)
print(rmse)
print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#Test against unseen data: refit the regressor on the full training set, then score the held-out test set
regr.fit(X_train_vec, y_train)
y_pred = regr.predict(X_test_vec)
y_true = np.array(y_test)

#r2, MAE and RMSE on the test set
print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))
#sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
print(np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
#Parity plot: predicted against actual values, with a least-squares trend line
y_true = y_true.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=10, alpha=0.9)
ax.plot(y_true, trend, color='black', alpha=0.7)
ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
plt.show()

Convolutional Neural Network (CNN)

In [None]:
#import libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

#define parameters
batch_size = 32
epochs = 100
#Conv1D needs a trailing channel axis: (samples, features) -> (samples, features, 1).
#Using -1 instead of shape[1] keeps this cell re-runnable even if the arrays were
#already reshaped to a 3-D layout by an earlier run of this or another cell.
X_train_vec = X_train_vec.reshape(X_train_vec.shape[0], -1, 1)
X_test_vec = X_test_vec.reshape(X_test_vec.shape[0], -1, 1)


def build_cnn_model(input_shape):
    """Return a freshly initialised, compiled 1-D CNN regressor.

    input_shape: per-sample shape passed to the first Conv1D layer,
    e.g. (300, 1) for 300-dimensional vectors with one channel.
    """
    cnn = Sequential()
    cnn.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    cnn.add(MaxPooling1D(pool_size=2))
    cnn.add(Conv1D(128, kernel_size=3, activation='relu'))
    cnn.add(MaxPooling1D(pool_size=2))
    cnn.add(Flatten())
    cnn.add(Dense(128, activation='relu'))
    cnn.add(Dense(1, activation='linear'))
    cnn.compile(optimizer='adam', loss='mean_squared_error')
    return cnn


#run k-fold cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train, test in kf.split(X_train_vec, y_train):
    #Build a new network for every fold. Reusing one model across folds keeps the
    #weights learned in earlier folds, whose training data contains the current
    #validation samples, so the validation scores would be optimistically biased.
    model = build_cnn_model(X_train_vec.shape[1:])
    hist = model.fit(X_train_vec[train], y_train[train], batch_size=batch_size, epochs=epochs, verbose=0, validation_data=(X_train_vec[test], y_train[test]))
    #predict returns shape (n, 1); flatten to match the 1-D targets
    y_pred = model.predict(X_train_vec[test]).ravel()
    y_true = np.array(y_train[test])
    r2.append(r2_score(y_true, y_pred))
    mae.append(mean_absolute_error(y_true, y_pred))
    #sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained network in `model` so that later code fitting `model` on the
#full training set starts from fresh weights rather than from the last fold's.
model = build_cnn_model(X_train_vec.shape[1:])

#learning curve: per-epoch losses averaged over the folds
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
from keras.models import clone_model

#clone_model rebuilds the same architecture with newly initialised weights, so the
#final model is trained for exactly `epochs` epochs on the full training set no
#matter how much training `model` has already received or how often this cell is re-run.
model = clone_model(model)
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_vec, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
#predict returns shape (n, 1); flatten to match the 1-D targets
y_pred = model.predict(X_test_vec).ravel()
y_true = np.array(y_test)

#r2, MAE and RMSE on the test set
print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))
#sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
print(np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
#Parity plot: predicted against actual values, with a least-squares trend line
y_true = y_true.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=10, alpha=0.9)
ax.plot(y_true, trend, color='black', alpha=0.7)
ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
plt.show()

Recurrent Neural Network (RNN)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

#define parameters
batch_size = 32
epochs = 100
#The LSTM expects (samples, timesteps, features); each vector is fed as one timestep.
#Using -1 instead of shape[1] keeps this cell re-runnable whether the arrays are
#currently 2-D or were already reshaped to a 3-D layout by an earlier cell or run.
X_train_vec = X_train_vec.reshape(X_train_vec.shape[0], 1, -1)
X_test_vec = X_test_vec.reshape(X_test_vec.shape[0], 1, -1)


def build_rnn_model(input_shape):
    """Return a freshly initialised, compiled LSTM regressor.

    input_shape: per-sample (timesteps, features) shape passed to the LSTM layer,
    e.g. (1, 300) for a single timestep of 300-dimensional vectors.
    """
    rnn = Sequential()
    rnn.add(LSTM(64, input_shape=input_shape))
    rnn.add(Dense(1, activation='linear'))
    rnn.compile(optimizer='adam', loss='mean_squared_error')
    return rnn


#run k-fold cross validation
kf = KFold(n_splits=5)
r2, mae, rmse = [], [], []
train_loss, val_loss = [], []
for train, test in kf.split(X_train_vec, y_train):
    #Build a new network for every fold. Reusing one model across folds keeps the
    #weights learned in earlier folds, whose training data contains the current
    #validation samples, so the validation scores would be optimistically biased.
    model = build_rnn_model(X_train_vec.shape[1:])
    hist = model.fit(X_train_vec[train], y_train[train], batch_size=batch_size, epochs=epochs, verbose=0, validation_data=(X_train_vec[test], y_train[test]))
    #predict returns shape (n, 1); flatten to match the 1-D targets
    y_pred = model.predict(X_train_vec[test]).ravel()
    y_true = np.array(y_train[test])
    r2.append(r2_score(y_true, y_pred))
    mae.append(mean_absolute_error(y_true, y_pred))
    #sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
    rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))

    #record loss values
    train_loss.append(hist.history['loss'])
    val_loss.append(hist.history['val_loss'])

#Leave an untrained network in `model` so that later code fitting `model` on the
#full training set starts from fresh weights rather than from the last fold's.
model = build_rnn_model(X_train_vec.shape[1:])

#learning curve: per-epoch losses averaged over the folds
plt.figure(figsize=(10, 5))
plt.plot(np.mean(train_loss, axis=0), label='Training Loss')
plt.plot(np.mean(val_loss, axis=0), label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print(f"Average r2 score: {np.mean(r2)}\nAverage MAE: {np.mean(mae)}\nAverage RMSE: {np.mean(rmse)}")

In [None]:
#test against unseen data
from keras.models import clone_model

#clone_model rebuilds the same architecture with newly initialised weights, so the
#final model is trained for exactly `epochs` epochs on the full training set no
#matter how much training `model` has already received or how often this cell is re-run.
model = clone_model(model)
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_vec, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
#predict returns shape (n, 1); flatten to match the 1-D targets
y_pred = model.predict(X_test_vec).ravel()
y_true = np.array(y_test)

#r2, MAE and RMSE on the test set
print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))
#sqrt(MSE) instead of squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6
print(np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
#Parity plot: predicted against actual values, with a least-squares trend line
y_true = y_true.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
trend = LinearRegression().fit(y_true, y_pred).predict(y_true)

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=10, alpha=0.9)
ax.plot(y_true, trend, color='black', alpha=0.7)
ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
plt.show()