In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [2]:
# Read the photoswitch dataset from disk with pandas, then pull its SMILES column out as a plain Python list
photoswitches = pd.read_csv(filepath_or_buffer='./photoswitches.csv')
smiles_list = photoswitches.loc[:, 'SMILES'].tolist()

In [3]:
# Sanity check: number of SMILES strings read from the dataset
len(smiles_list)

405

In [4]:
# Parse every SMILES string into an RDKit molecule; the list stays aligned with smiles_list
rdkit_mols = [MolFromSmiles(smiles) for smiles in smiles_list]

# MolFromSmiles signals a parse failure by returning None instead of raising. A None entry
# would only blow up later, inside the fingerprint/descriptor code, with a much less helpful
# message -- so stop here and name the offending rows.
unparsed_smiles = [
    (position, smiles)
    for position, (smiles, mol) in enumerate(zip(smiles_list, rdkit_mols))
    if mol is None
]
if unparsed_smiles:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s) "
        f"(position, SMILES): {unparsed_smiles}"
    )

In [5]:
# Morgan fingerprints as bit vectors -- mind the settings: radius 3, 2048 bits
bit_vectors = []
for molecule in rdkit_mols:
    bit_vectors.append(AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=2048))

# Convert the list of per-molecule bit vectors into a single numpy array
morgan_fingerprints = np.asarray(bit_vectors)

In [6]:
# Turn the fingerprints into a pandas DataFrame with the SMILES strings as the first column.
# Once this cell has run, `morgan_fingerprints` is already a DataFrame carrying a "SMILES"
# column, and DataFrame.insert raises ValueError for a column name that already exists.
# In that case overwrite the column instead, so the cell can be re-executed safely.
morgan_fingerprints = pd.DataFrame(data=morgan_fingerprints)
if "SMILES" in morgan_fingerprints.columns:
    morgan_fingerprints["SMILES"] = smiles_list
else:
    morgan_fingerprints.insert(0, "SMILES", smiles_list)

In [7]:
# Display the fingerprint table (SMILES first, then one column per fingerprint bit)
morgan_fingerprints

Unnamed: 0,SMILES,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
403,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
# Persist the fingerprint table as a CSV file (relative path)
morgan_fingerprints.to_csv(path_or_buf="morgan_fingerprints.csv")

In [9]:
# Next, rdkit's own descriptors
from rdkit.Chem import Descriptors

In [10]:
# The list of available descriptors, given as (name, function) pairs
Descriptors.descList

[('MaxEStateIndex',
  <function rdkit.Chem.EState.EState.MaxEStateIndex(mol, force=1)>),
 ('MinEStateIndex',
  <function rdkit.Chem.EState.EState.MinEStateIndex(mol, force=1)>),
 ('MaxAbsEStateIndex',
  <function rdkit.Chem.EState.EState.MaxAbsEStateIndex(mol, force=1)>),
 ('MinAbsEStateIndex',
  <function rdkit.Chem.EState.EState.MinAbsEStateIndex(mol, force=1)>),
 ('qed',
  <function rdkit.Chem.QED.qed(mol, w=QEDproperties(MW=0.66, ALOGP=0.46, HBA=0.05, HBD=0.61, PSA=0.06, ROTB=0.65, AROM=0.48, ALERTS=0.95), qedProperties=None)>),
 ('MolWt', <function rdkit.Chem.Descriptors.<lambda>(*x, **y)>),
 ('HeavyAtomMolWt', <function rdkit.Chem.Descriptors.HeavyAtomMolWt(x)>),
 ('ExactMolWt', <function rdkit.Chem.Descriptors.<lambda>(*x, **y)>),
 ('NumValenceElectrons',
  <function rdkit.Chem.Descriptors.NumValenceElectrons(mol)>),
 ('NumRadicalElectrons',
  <function rdkit.Chem.Descriptors.NumRadicalElectrons(mol)>),
 ('MaxPartialCharge',
  <function rdkit.Chem.Descriptors.MaxPartialCharge(mo

In [11]:
# Map each descriptor name to the function that computes it
# (descList is a list of (name, function) pairs, so dict() consumes it directly)
all_descriptors = dict(Descriptors.descList)

In [12]:
# Start a fresh DataFrame whose only column holds the SMILES strings
smiles_column = np.array(smiles_list)
rdkit_descriptors = pd.DataFrame({"SMILES": smiles_column})
rdkit_descriptors

Unnamed: 0,SMILES
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2
...,...
400,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
401,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
402,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
403,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...


In [13]:
# Compute every descriptor for every molecule: one column per descriptor, one value per molecule.
# All columns are built first and attached with a single concat. Assigning many columns one at
# a time fragments the frame internally and makes pandas emit a PerformanceWarning.
descriptor_columns = pd.DataFrame(
    {
        name: [compute(mol) for mol in rdkit_mols]
        for name, compute in all_descriptors.items()
    },
    index=rdkit_descriptors.index,
)

# Discard descriptor columns left over from an earlier execution so that re-running the cell
# overwrites them (as per-column assignment did) rather than duplicating them.
existing_columns = rdkit_descriptors.drop(columns=list(all_descriptors), errors="ignore")
rdkit_descriptors = pd.concat([existing_columns, descriptor_columns], axis=1)

rdkit_descriptors

Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2,3.936481,0.260512,3.936481,0.260512,0.672275,188.194,180.130,188.081044,70,...,0,0,0,0,1,0,0,0,0,0
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,3.976481,0.371623,3.976481,0.371623,0.676690,187.206,178.134,187.085795,70,...,0,0,0,0,0,0,0,0,0,0
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2,4.081997,0.621623,4.081997,0.621623,0.664734,186.218,176.138,186.090546,70,...,0,0,0,0,0,0,0,0,0,0
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2,4.181534,0.667920,4.181534,0.667920,0.686454,200.245,188.149,200.106196,76,...,0,0,0,0,0,0,0,0,0,0
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2,4.061481,0.760512,4.061481,0.760512,0.664734,186.218,176.138,186.090546,70,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...,9.883085,0.134711,9.883085,0.134711,0.486213,334.404,320.292,334.088832,118,...,0,0,0,0,0,1,0,0,0,0
401,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...,9.822888,0.123414,9.822888,0.123414,0.541174,289.298,278.210,289.096360,106,...,0,0,0,0,0,0,0,0,0,0
402,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...,10.934342,-0.422909,10.934342,0.033505,0.424569,416.532,396.372,416.108916,146,...,1,0,0,0,0,1,0,0,0,0
403,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...,10.920834,-0.461717,10.920834,0.014485,0.220871,440.897,431.825,439.991693,142,...,0,0,0,0,0,1,0,1,0,0


In [14]:
# Persist the RDKit descriptor table as a CSV file (relative path)
rdkit_descriptors.to_csv(path_or_buf="rdkit_descriptors.csv")

In [15]:
# Finally, mordred descriptors
from mordred import Calculator, descriptors, error

In [16]:
# Initialise a mordred calculator, registering everything in the `descriptors` module with it.
# mordred computes descriptors through a Calculator object rather than through plain functions.
calc = Calculator(descriptors)

In [17]:
# Number of descriptors registered with the calculator -- there are a lot of them
len(calc.descriptors)

1826

In [18]:
# Evaluate the registered descriptors for all molecules; this takes a while and shows a progress bar.
# The result is handled as a pandas DataFrame in the cells below.
mordred_descriptors = calc.pandas(rdkit_mols)

 16%|█▌        | 63/405 [00:06<01:03,  5.39it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 405/405 [00:27<00:00, 14.80it/s]


In [19]:
# Unfortunately some mordred descriptors cannot be computed. Columns affected by this usually
# hold non-numerical values and therefore have data type "object", so collect the positions
# of all object-typed columns.
error_columns = [
    position
    for position, column_dtype in enumerate(mordred_descriptors.dtypes)
    if column_dtype == "object"
]
error_columns

[780,
 781,
 782,
 783,
 784,
 785,
 786,
 787,
 788,
 789,
 790,
 791,
 792,
 793,
 794,
 795,
 796,
 797,
 798,
 799,
 800,
 801,
 802,
 803,
 804,
 805,
 806,
 807,
 808,
 809,
 810,
 811,
 812,
 813,
 814,
 817,
 818,
 819,
 820,
 821,
 822,
 906,
 907,
 908,
 909,
 910,
 911,
 912,
 913,
 914,
 915,
 916,
 917,
 918,
 919,
 1090,
 1091,
 1092,
 1093,
 1094,
 1095,
 1096,
 1097,
 1098,
 1099,
 1100,
 1101,
 1102,
 1103,
 1104,
 1105,
 1107,
 1108,
 1109,
 1110,
 1111,
 1112,
 1113,
 1114,
 1115,
 1116,
 1118,
 1119,
 1120,
 1121,
 1122,
 1123,
 1124,
 1125,
 1126,
 1127,
 1128,
 1129,
 1130,
 1131,
 1132,
 1133,
 1134,
 1135,
 1136,
 1137,
 1138,
 1139,
 1140,
 1141,
 1142,
 1143,
 1144,
 1145,
 1146,
 1147,
 1148,
 1149,
 1150,
 1151,
 1152,
 1153,
 1154,
 1155,
 1156,
 1157,
 1158,
 1159,
 1160,
 1161,
 1162,
 1163,
 1164,
 1165,
 1166,
 1167,
 1168,
 1169,
 1170,
 1171,
 1172,
 1173,
 1174,
 1175,
 1176,
 1177,
 1178,
 1179,
 1180,
 1181,
 1182,
 1183,
 1184,
 1186,
 1187,
 1188

In [20]:
# Use .drop to remove the affected (object-dtype) columns.
# The columns are picked by their current dtype instead of by the positions stored in
# `error_columns`: those positions describe the frame before anything was dropped, so running
# this cell a second time with them would silently remove valid descriptor columns that have
# shifted into those slots. Selecting by dtype removes exactly the same columns on the first
# run and is harmless afterwards (note it would also remove any object-dtype column added
# later, e.g. SMILES, until that column is inserted again).
is_object_column = (mordred_descriptors.dtypes == "object").to_numpy()
object_column_names = mordred_descriptors.columns[is_object_column]
mordred_descriptors = mordred_descriptors.drop(columns=object_column_names)

In [21]:
# Also discard every column that still contains NA values -- this may well be a no-op here
mordred_descriptors = mordred_descriptors.dropna(axis="columns")

In [22]:
# Again, put the SMILES strings in the first column.
# DataFrame.insert raises ValueError when the column already exists, which is what happens if
# this cell is executed twice; overwrite the existing column in that case instead.
if "SMILES" in mordred_descriptors.columns:
    mordred_descriptors["SMILES"] = smiles_list
else:
    mordred_descriptors.insert(0, "SMILES", smiles_list)
mordred_descriptors

Unnamed: 0,SMILES,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2,10.715992,9.304580,2,0,18.370637,2.259932,4.435888,18.370637,1.312188,...,9.081484,58.361026,188.081044,8.549138,342,14,68.0,75.0,3.833333,3.166667
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,10.715992,9.304580,0,0,18.370637,2.259932,4.435888,18.370637,1.312188,...,9.081484,58.361026,187.085795,8.134165,342,14,68.0,75.0,3.833333,3.166667
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2,10.715992,9.304580,0,0,18.370637,2.259932,4.435888,18.370637,1.312188,...,9.081484,58.361026,186.090546,7.753773,342,14,68.0,75.0,3.833333,3.166667
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2,11.492048,10.070849,0,0,19.241292,2.321086,4.511537,19.241292,1.282753,...,9.271718,60.342987,200.106196,7.411341,403,17,74.0,83.0,4.694444,3.361111
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2,10.715992,9.304580,0,0,18.370637,2.259932,4.435888,18.370637,1.312188,...,9.081484,58.361026,186.090546,7.753773,342,14,68.0,75.0,3.833333,3.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...,19.217852,14.577842,0,0,31.530229,2.440229,4.850263,31.530229,1.313760,...,10.207473,72.933632,334.088832,8.791811,1425,39,132.0,158.0,7.000000,5.138889
401,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...,17.625299,13.419762,0,0,29.700756,2.423279,4.833309,29.700756,1.350034,...,10.069891,70.381330,289.096360,8.760496,1112,34,120.0,143.0,5.277778,4.777778
402,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...,22.005839,17.491326,0,1,36.641554,2.468925,4.787066,36.641554,1.308627,...,10.259867,79.622966,416.108916,8.668936,2173,44,150.0,179.0,8.611111,6.166667
403,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...,22.822336,17.990446,0,0,37.442910,2.472194,4.805734,37.442910,1.291135,...,10.306450,80.832586,439.991693,11.578729,2402,46,156.0,186.0,9.472222,6.333333


In [23]:
# Persist the mordred descriptor table as a CSV file (relative path)
mordred_descriptors.to_csv(path_or_buf="mordred_descriptors.csv")

In [24]:
# Finally, generate one PNG image per molecule, named after the molecule's position in the list
import os

from rdkit.Chem import Draw

# Nothing else in the notebook creates the output folder, and writing into a missing
# directory fails, so make sure it exists before drawing.
os.makedirs("molecule_images", exist_ok=True)
for i, mol in enumerate(rdkit_mols):
    Draw.MolToFile(mol, filename=f"molecule_images/{i}.png")