In [1]:
import numpy as np
import pandas as pd
import sklearn
from rdkit import Chem

import xgboost as xgboost
import catboost as catboost
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
from rdkit.Chem import AllChem

In [2]:
def make_test_set_split_only_by_mol(data, test_ratio, random_state = 42):
    """Split ``data`` into a CV set and a test set without sharing molecules.

    The unique values of the ``"Chromophore_smiles"`` column are split at the
    molecule level, then every row of ``data`` is routed to the side its
    molecule was assigned to. Rows therefore never leak a chromophore from
    the CV set into the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Chromophore_smiles"`` column.
    test_ratio : float
        Fraction (between 0 and 1) of the *unique molecules* held out for the
        test set; ``1 - test_ratio`` of them are sampled for the CV set.
    random_state : int, default 42
        Seed forwarded to ``Series.sample`` so the split is reproducible.

    Returns
    -------
    (cv_set, test_set) : tuple of pandas.DataFrame
        Row subsets of ``data`` that keep the original index labels.
        ``cv_set`` rows are grouped by molecule in sampled order; ``test_set``
        rows are grouped by molecule in order of first appearance in ``data``.
        An empty side is returned as a zero-row frame with ``data``'s columns.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    smiles = data["Chromophore_smiles"]
    mol_class = smiles.drop_duplicates()
    mol_train = mol_class.sample(frac = (1-test_ratio), random_state = random_state)
    # Complement by value: keeps first-appearance order instead of relying on
    # the iteration order of a Python set.
    mol_test = mol_class[~mol_class.isin(mol_train)]

    def _rows_for(mols):
        # Collect the per-molecule slices first and concatenate once; growing a
        # frame inside the loop copies all accumulated rows on every step.
        frames = [data[smiles == mol] for mol in mols]
        if not frames:
            # Zero-row slice keeps the columns and dtypes of ``data``.
            return data.iloc[0:0]
        return pd.concat(frames, axis=0)

    cv_set = _rows_for(mol_train)
    test_set = _rows_for(mol_test)

    return cv_set, test_set

# Absorption Wavelength

In [3]:
# Absorption dataset; read_csv without index_col gives a default 0..n-1 index.
ABS_DATA_CSV = "../data/cuma_branch in D4C DB/cuma_branch_sol_abs.csv"
cuma_branch_sol_abs = pd.read_csv(ABS_DATA_CSV)

# Hold out 20% of the unique chromophores as the test set; the remaining
# molecules form the CV (training) set.
cv_set, test_set = make_test_set_split_only_by_mol(
    data=cuma_branch_sol_abs,
    test_ratio=0.2,
)

# Optimized Parameters Applied
The models below use the optimized parameters obtained through the GWGC-optimization process.

## Test set performance: XGBoost

In [None]:
# GWGC parameters (the values passed here are the ones tuned for XGBoost;
# the meaning of each positional argument is defined by the helper itself).
from functions.conv_input_Gauss_function import one_touch_representation_Gauss
input_final = one_touch_representation_Gauss(cuma_branch_sol_abs,  6,  6*( 8.154810535610794/10), True)

# Full label column; still defined in case later cells read it, but it is
# deliberately NOT used to fit the scaler (see below).
Y = np.array(cuma_branch_sol_abs['Abs']).reshape(-1,1)

# Training (CV) split. The DataFrame index labels are used as row positions
# into input_final, which relies on the default index produced by read_csv.
cv_idx = list(cv_set.index)
cv_input = input_final[cv_idx, :]
cv_label = np.array(cv_set['Abs']).flatten()

# Held-out test split.
test_idx = list(test_set.index)
test_input = input_final[test_idx, :]
test_label = np.array(test_set['Abs']).flatten()

# Fit the label scaler on the training labels only, so that test-set label
# statistics never influence the training targets (no leakage).
scaler = StandardScaler()
scaler.fit(cv_label.reshape(-1, 1))
scaled_cv_label = scaler.transform(cv_label.reshape(-1, 1))

xgb = xgboost.XGBRegressor(n_estimators = 82, learning_rate= 0.17041209198212898, subsample =0.839665562703557, max_depth = 9, random_state=42)
xgb.fit(cv_input, scaled_cv_label.flatten())

# Predictions come out in scaled space; map them back to the original label
# units once and reuse the result for every metric.
xgb_predict = xgb.predict(test_input)
xgb_predict_abs = scaler.inverse_transform(xgb_predict.reshape(-1, 1)).ravel()

xgb_mae = metrics.mean_absolute_error(test_label, xgb_predict_abs)
xgb_rmse = np.sqrt(metrics.mean_squared_error(test_label, xgb_predict_abs))
xgb_r2 = metrics.r2_score(test_label, xgb_predict_abs)
print(f'Abs_XGB_MAE : {xgb_mae}\nAbs_XGB_RMSE : {xgb_rmse}\nAbs_XGB_R2 :{xgb_r2}' )

## Test set performance: CatBoost

In [None]:
# GWGC parameters (the values passed here are the ones tuned for CatBoost;
# the meaning of each positional argument is defined by the helper itself).
from functions.conv_input_Gauss_function import one_touch_representation_Gauss
input_final = one_touch_representation_Gauss(cuma_branch_sol_abs, 11,  11*( 1.1242731721231125/10), True)

# Full label column; still defined in case later cells read it, but it is
# deliberately NOT used to fit the scaler (see below).
Y = np.array(cuma_branch_sol_abs['Abs']).reshape(-1,1)

# Training (CV) split. The DataFrame index labels are used as row positions
# into input_final, which relies on the default index produced by read_csv.
cv_idx = list(cv_set.index)
cv_input = input_final[cv_idx, :]
cv_label = np.array(cv_set['Abs']).flatten()

# Held-out test split.
test_idx = list(test_set.index)
test_input = input_final[test_idx, :]
test_label = np.array(test_set['Abs']).flatten()

# Fit the label scaler on the training labels only, so that test-set label
# statistics never influence the training targets (no leakage).
scaler = StandardScaler()
scaler.fit(cv_label.reshape(-1, 1))
scaled_cv_label = scaler.transform(cv_label.reshape(-1, 1))

cat = catboost.CatBoostRegressor(silent = True, learning_rate=0.07148493031894261, depth = 7, l2_leaf_reg = 3)
cat.fit(cv_input, scaled_cv_label.flatten())

# Predictions come out in scaled space; map them back to the original label
# units once and reuse the result for every metric.
cat_predict = cat.predict(test_input)
cat_predict_abs = scaler.inverse_transform(cat_predict.reshape(-1, 1)).ravel()

cat_mae = metrics.mean_absolute_error(test_label, cat_predict_abs)
cat_rmse = np.sqrt(metrics.mean_squared_error(test_label, cat_predict_abs))
cat_r2 = metrics.r2_score(test_label, cat_predict_abs)
print(f'Abs_Cat_MAE : {cat_mae}\nAbs_Cat_RMSE : {cat_rmse}\nAbs_Cat_R2 :{cat_r2}' )

## Test set performance: RandomForest

In [None]:
# GWGC parameters (the values passed here are the ones tuned for RandomForest;
# the meaning of each positional argument is defined by the helper itself).
from functions.conv_input_Gauss_function import one_touch_representation_Gauss
input_final = one_touch_representation_Gauss(cuma_branch_sol_abs, 2,  2*( 8.681260573682142/10), True)

# Full label column; still defined in case later cells read it, but it is
# deliberately NOT used to fit the scaler (see below).
Y = np.array(cuma_branch_sol_abs['Abs']).reshape(-1,1)

# Training (CV) split. The DataFrame index labels are used as row positions
# into input_final, which relies on the default index produced by read_csv.
cv_idx = list(cv_set.index)
cv_input = input_final[cv_idx, :]
cv_label = np.array(cv_set['Abs']).flatten()

# Held-out test split.
test_idx = list(test_set.index)
test_input = input_final[test_idx, :]
test_label = np.array(test_set['Abs']).flatten()

# Fit the label scaler on the training labels only, so that test-set label
# statistics never influence the training targets (no leakage).
scaler = StandardScaler()
scaler.fit(cv_label.reshape(-1, 1))
scaled_cv_label = scaler.transform(cv_label.reshape(-1, 1))

# Draw 100 distinct forest seeds reproducibly; the reported metrics are the
# averages over these seeds.
np.random.seed(42)
random_states_list = np.random.choice(range(0, 1000), size=100, replace=False)

mae = []
rmse = []
r2 = []
for rs in random_states_list:
    rnd = ensemble.RandomForestRegressor(n_estimators = 74, max_depth= 199, random_state=rs)
    rnd.fit(cv_input, scaled_cv_label.flatten())

    # Map the scaled predictions back to the original label units once per
    # seed and reuse the result for every metric.
    rnd_predict = rnd.predict(test_input)
    rnd_predict_abs = scaler.inverse_transform(rnd_predict.reshape(-1, 1)).ravel()

    mae.append(metrics.mean_absolute_error(test_label, rnd_predict_abs))
    rmse.append(np.sqrt(metrics.mean_squared_error(test_label, rnd_predict_abs)))
    r2.append(metrics.r2_score(test_label, rnd_predict_abs))

# Seed-averaged test metrics.
rnd_mae = np.mean(mae)
rnd_rmse = np.mean(rmse)
rnd_r2 = np.mean(r2)

print(f'Abs_RF_MAE : {rnd_mae}\nAbs_RF_RMSE : {rnd_rmse}\nAbs_RF_R2 :{rnd_r2}' )