In [1]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [11]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [10]:
# The workbook is not fetched by the notebook: upload it first with the "Choose Files" button.
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# SMILES column of the workbook; parsed into RDKit molecules further down.
smiles_list = df.loc[:, "Smiles"]


# Morgan

In [4]:
# Parse every SMILES string. RDKit signals a parse failure by returning None, not by raising.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Fail early and name the offending inputs, instead of hitting an opaque error
# inside the fingerprint call when it receives None.
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
  raise ValueError(f"RDKit could not parse {len(unparsed)} SMILES: {unparsed}")

# Morgan bit-vector fingerprints with radius 2; nBits=2048 is the RDKit default, spelled out.
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
len(mols)

63

In [13]:
# Regression target (passed as y to the models below) and the two orbital-energy columns.
dff_values = df["dFF"]
homo_values = df["HOMO energy (eV)"]
lumo_values = df["LUMO energy (eV)"]

# Design matrix: the Morgan fingerprint of each molecule followed by its HOMO and LUMO
# energies. np.column_stack turns the list of RDKit fingerprint objects into array columns.
features = np.column_stack([fingerprints, homo_values, lumo_values])

In [14]:
# Repeated hold-out evaluation of an RBF-kernel SVR on the current `features` matrix:
# 200 different 80/20 splits, one test-set R^2 written per line of the output file.
with open('out-SVR-Morgan-rbf.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [15]:
# Repeated hold-out evaluation of a linear-kernel SVR on the current `features` matrix:
# 200 different 80/20 splits, one test-set R^2 written per line of the output file.
with open('out-SVR-Morgan-linear.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [16]:
# Repeated hold-out evaluation of a sigmoid-kernel SVR on the current `features` matrix:
# 200 different 80/20 splits, one test-set R^2 written per line of the output file.
with open('out-SVR-Morgan-sigmoid.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

# MACCS

In [19]:
#MACCS
from rdkit.Chem import MACCSkeys

# Parse every SMILES string. RDKit signals a parse failure by returning None, not by raising.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Fail early and name the offending inputs, instead of hitting an opaque error
# inside the fingerprint call when it receives None.
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
  raise ValueError(f"RDKit could not parse {len(unparsed)} SMILES: {unparsed}")

# One MACCS-keys fingerprint per molecule.
Maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]

In [21]:
# Design matrix: MACCS fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Maccs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of an RBF-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-rbf-MACCS.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)


In [22]:
# Design matrix: MACCS fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Maccs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a sigmoid-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-sigmoid-MACCS.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [23]:
# Design matrix: MACCS fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Maccs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a linear-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-linear-MACCS.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

# Avalon

In [24]:
pip install rdkit-pypi avalon_framework

Collecting avalon_framework
  Downloading avalon_framework-1.8.2.tar.gz (3.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: avalon_framework
  Building wheel for avalon_framework (setup.py) ... [?25l[?25hdone
  Created wheel for avalon_framework: filename=avalon_framework-1.8.2-py3-none-any.whl size=3863 sha256=17a9981637f012e50844decf1e60eb01d095a1c0974c489c60079d0bb8dacfb7
  Stored in directory: /root/.cache/pip/wheels/78/3f/5c/a65bfa8ce94f62739865cf30e5687272ee719961b4311d05e3
Successfully built avalon_framework
Installing collected packages: avalon_framework
Successfully installed avalon_framework-1.8.2


In [25]:
from rdkit import Chem
from rdkit.Avalon import pyAvalonTools

In [26]:
# One Avalon fingerprint per molecule in `mols`, using GetAvalonFP's default arguments.
Avalon_fingerprints = list(map(pyAvalonTools.GetAvalonFP, mols))

In [27]:
# Design matrix: Avalon fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Avalon_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of an RBF-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-rbf-Avalon.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [28]:
# Design matrix: Avalon fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Avalon_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a sigmoid-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-sigmoid-Avalon.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [29]:
# Design matrix: Avalon fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((Avalon_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a linear-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-linear-Avalon.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the discarded MSE call and the
    # leftover commented-out experiments have been removed.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

# Daylight

In [31]:
from rdkit import Chem
from rdkit.Chem import DataStructs
# One RDKit path-based fingerprint per molecule in `mols`; RDKFingerprint is called
# with its default arguments.
Daylight_fingerprints = list(map(Chem.RDKFingerprint, mols))

In [32]:
# Design matrix: RDKit (Daylight-like) fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((Daylight_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of an RBF-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-rbf-Daylight.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [33]:
# Design matrix: RDKit (Daylight-like) fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((Daylight_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a linear-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-linear-Daylight.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [34]:
# Design matrix: RDKit (Daylight-like) fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((Daylight_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a sigmoid-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-sigmoid-Daylight.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

# AtomPairs

In [39]:
from rdkit.Chem import rdMolDescriptors

# Parse every SMILES string. RDKit signals a parse failure by returning None, not by raising.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Fail early and name the offending inputs, instead of hitting an opaque error
# inside the fingerprint call when it receives None.
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
  raise ValueError(f"RDKit could not parse {len(unparsed)} SMILES: {unparsed}")

# One hashed atom-pair bit-vector fingerprint per molecule (RDKit default arguments).
atom_pairs_fingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in mols]

In [36]:
# Design matrix: atom-pair fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((atom_pairs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a sigmoid-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-sigmoid-atompairs.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [37]:
# Design matrix: atom-pair fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((atom_pairs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of an RBF-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-rbf-atompairs.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [38]:
# Design matrix: atom-pair fingerprint of each molecule followed by its HOMO and LUMO energies.
features = np.column_stack((atom_pairs_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a linear-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-linear-atompairs.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

# Topological Torsion

In [40]:
# Parse every SMILES string. RDKit signals a parse failure by returning None, not by raising.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Fail early and name the offending inputs, instead of hitting an opaque error
# inside the fingerprint call when it receives None.
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
  raise ValueError(f"RDKit could not parse {len(unparsed)} SMILES: {unparsed}")

# One hashed topological-torsion bit-vector fingerprint per molecule (RDKit default arguments).
torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in mols]

In [41]:
# Design matrix: topological-torsion fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((torsion_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a linear-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-linear-torsion.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='linear')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [42]:
# Design matrix: topological-torsion fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((torsion_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of an RBF-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-rbf-torsion.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='rbf')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)

In [43]:
# Design matrix: topological-torsion fingerprint of each molecule followed by its
# HOMO and LUMO energies.
features = np.column_stack((torsion_fingerprints, homo_values, lumo_values))

# Repeated hold-out evaluation of a sigmoid-kernel SVR: 200 different 80/20 splits,
# one test-set R^2 written per line of the output file.
with open('out-SVR-sigmoid-torsion.txt', 'w') as f:
  for seed in range(200):
    # The loop counter seeds the split, so every repetition is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, dff_values, test_size=0.20, random_state=seed)

    regressor = SVR(kernel='sigmoid')
    regressor.fit(x_train, y_train)

    # Score on the held-out 20 %. Only R^2 is recorded; the MSE that used to be
    # computed here was thrown away, so the call has been dropped.
    y_pred = regressor.predict(x_test)
    r2_score_value = r2_score(y_test, y_pred)

    print(r2_score_value, file=f)