In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from utils import load_features_and_labels, plot_pca
from models.random_forest import train_rf_model
from models.gaussian_process import train_gp_model
from lib import generate_smiles

from rdkit.Chem import MolFromSmiles
from mordred import Calculator, descriptors, error

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

### Generating a library of photoswitches

In [None]:
parent_mols = [
  '[U](C1=CC=C(C=C1)N=NC2=CC=C([U])C=C2)' # azobenzene
  'C2(=CC=C([U])[N]2)N=NC3=CC=C([U])[N]3' # bisazopyrrole
  'C2(=CC=C([U])[S]2)N=NC3=CC=C([U])[S]3' # bisazothiophene
  'C2(=CC=C([U])[O]2)N=NC3=CC=C([U])[O]3' # bisazofuran
]

linkers = [
  '([H])',                                # H-terminus
  '(c2ccc([Y])cc2)',                      # benzene
  '(c2ncc([Y])cc2)',                      # pyridine
  '(c2ncc([Y])cn2)',                      # pyrimidine
  '(c2nnc([Y])nn2)',                      # tetrazine
  'C2=CC=C([Y])C2',                       # cyclopentadiene
  '(c2ccc([Y])N2)',                       # pyrrole (2,5) 
  '(c2cc([Y])cN2)',                       # pyrrole (2,4) 
  '(c2ccc([Y])N(C)2)',                    # pyrrole(N-methyl) 
  '(c2ccc([Y])N(C=O)2)',                  # pyrrole(N-COH)
  '(c1cnc([Y])N1)',                       # imidazole
  'c2ccc([Y])O2',                         # furan
  'c2ccc([Y])S2',                         # thiophene
  '(c2ccc([Y])S(=O)(=O)2)',               # thiophene(dioxide)
  '(c2sc([Y])cn2)',                       # thiazole (2,5)
  '(c2scc([Y])n2)',                       # thiazole (2,4)
  '(c1ncc([Y])o1)',                       # oxazole (2,5)
  '(c1nc([Y])co1)',                       # oxazole (2,4)
  '(C#C[Y])',                             # acetylene
  '/C=C/[Y]',                             # ethylene(trans)
  '(C=N[Y])'                              # imine
] 

terminals = [
  '([H])',                                # hydrogen
  '([OH])',                               # hydroxy 
  '[C](F)(F)F',                           # trifluoromethyl 
  '[O][C](F)(F)F',                        # trifluoromethoxy 
  '[C]',                                  # methyl
  '[O][C]',                               # methoxy 
  '[N+]([O-])=O',                         # nitro 
  '([SH])',                               # thiol 
  '[F]',                                  # fluoro 
  '[Cl]',                                 # chloro 
  'C#N'                                   # cyano 
]

generate_smiles('library_01.txt', parent_mols, linkers, terminals)

In [2]:
smiles, X, X_p, y = load_features_and_labels('./processed_data/mordred_descriptors.csv' ,'./raw_data/photoswitches.csv' ,'e_iso_pi')
model, x_scaler, y_scaler = train_gp_model(X, y)


Beginning training loop...
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)
(313, 1427) (313, 1)

mean R^2: 0.9045 +- 0.0080
mean RMSE: 20.1811 +- 0.8257
mean MAE: 12.8562 +- 0.4907



In [9]:
smiles_list = []
with open('library_01.txt', 'r') as f:
  lines = f.readlines()
  for line in lines:
    smiles_list.append(line.rstrip())

rdkit_mols = [MolFromSmiles(smiles) for smiles in smiles_list]

calc = Calculator(descriptors)

mordred_descriptors = calc.pandas(rdkit_mols)

error_columns = []
for i, e in enumerate(mordred_descriptors.dtypes):
    if e=="object":
        error_columns += [i]
        
mordred_descriptors = mordred_descriptors.drop(mordred_descriptors.columns[error_columns], axis=1)
mordred_descriptors = mordred_descriptors.dropna(axis=1)
mordred_descriptors.insert(0, "SMILES", smiles_list)
mordred_descriptors.to_csv('./mordred_descriptors_library_01.csv')

100%|██████████| 872/872 [02:09<00:00,  6.76it/s]


In [10]:
df1 = pd.read_csv('./processed_data/mordred_descriptors.csv')
original_columns = df1.columns[2:]

df2 = pd.read_csv('./mordred_descriptors_library_01.csv')
remove_columns = []
for column in df2.columns[2:]:
  if column not in original_columns:
    print(column)
    remove_columns.append(column)
    
df2 = df2.drop(labels=remove_columns, axis=1)

df2.to_csv('./mordred_descriptors_library_01.csv')

MAXaaCH
MINaaCH
MDEC-22
MDEC-23
Vabc


In [11]:
mordred_descriptors = pd.read_csv('mordred_descriptors_library_01.csv')

In [34]:
means = []
vars = []
scaled_vars = []

for idx, row in mordred_descriptors.iterrows():
  features = row.to_numpy()[4:].reshape(1, -1)
  features = x_scaler.transform(features)
  mean, var = model.predict_f(features)
  means.append(y_scaler.inverse_transform(mean)[0][0])
  vars.append(var[0][0].numpy())
  scaled_vars.append(y_scaler.inverse_transform(var)[0][0])
  
# mordred_descriptors.insert(loc=0, column='mean', value=means)
# mordred_descriptors.insert(loc=0, column='var', value=vars)

ValueError: cannot insert mean, already exists

In [28]:
print(len(means), len(vars))
print(len(mordred_descriptors))

872 872
872


In [None]:
desired_wavelength = 450
df_sort = mordred_descriptors.iloc[(mordred_descriptors['mean']-desired_wavelength).abs().argsort()[:2]]