In [1]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [2]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.svm import SVR
import numpy as np

In [5]:
# Topological-torsion fingerprints + HOMO/LUMO energies -> linear-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file for model training
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(torsion_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_torsion_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-linear-torsion.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='linear')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-lin-tor-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-lin-tor-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Morgan linear

In [6]:
# Morgan (radius 2) fingerprints + HOMO/LUMO energies -> linear-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
morgan_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(morgan_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_morgan_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_morgan_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-linear-morgan.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='linear')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-lin-morgan-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-lin-morgan-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Atompairs-linear

In [7]:
# Atom-pair fingerprints + HOMO/LUMO energies -> linear-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
atompairs_fingerprints = [AllChem.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(atompairs_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_atompairs_fingerprints = [AllChem.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_atompairs_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-linear-atompairs.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='linear')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-lin-atompairs-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-lin-atompairs-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Avalon linear

In [8]:
from rdkit import Chem
from rdkit.Avalon import pyAvalonTools

# Avalon fingerprints + HOMO/LUMO energies -> linear-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
avalon_fingerprints = [pyAvalonTools.GetAvalonFP(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(avalon_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_avalon_fingerprints = [pyAvalonTools.GetAvalonFP(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_avalon_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-linear-avalon.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='linear')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-lin-avalon-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-lin-avalon-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Maccs rbf

In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# MACCS keys + HOMO/LUMO energies -> RBF-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(maccs_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_maccs_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-rbf-maccs.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='rbf')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-rbf-maccs-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-rbf-maccs-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Torsion rbf

In [10]:
# Topological-torsion fingerprints + HOMO/LUMO energies -> RBF-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(torsion_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_torsion_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-rbf-torsion.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='rbf')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-rbf-torsion-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-rbf-torsion-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Morgan sigmoid

In [12]:
# Morgan (radius 2) fingerprints + HOMO/LUMO energies -> sigmoid-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
morgan_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(morgan_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_morgan_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_morgan_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-sigmoid-morgan.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='sigmoid')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-sigmoid-morgan-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-sigmoid-morgan-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Maccs sigmoid

In [13]:
# MACCS keys + HOMO/LUMO energies -> sigmoid-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Imported here so this cell does not depend on another cell having been run first
from rdkit.Chem import MACCSkeys

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(maccs_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_maccs_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-sigmoid-maccs.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='sigmoid')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-sigmoid-maccs-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-sigmoid-maccs-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Torsion sigmoid

In [14]:
# Topological-torsion fingerprints + HOMO/LUMO energies -> sigmoid-kernel SVR for dFF.
# One SVR is fitted per random 80/20 split (seeds 0-199); the five models with the
# highest hold-out R2 are kept and used to predict dFF for the blind-test molecules.
# NOTE(review): the kept models are selected on the same hold-out split whose R2 is
# reported, so the saved R2 values are optimistically biased.

# Read the input Excel file
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]

# Parse the training SMILES. MolFromSmiles returns None instead of raising, so stop
# here with the offending strings rather than failing inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
unparsed = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in HOMO-LUMO-energies.xlsx: {unparsed}")

# One feature row per molecule: fingerprint bits followed by the HOMO and LUMO energies
torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in mols]
fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(torsion_fingerprints, homo_energies, lumo_energies)]

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same parsing check and feature layout for the blind-test molecules
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
unparsed = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed:
    raise ValueError(f"Unparseable SMILES in INPUT-NEW-MOLS-correct-HOMOLUMO.csv: {unparsed}")
new_torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in new_mols]
new_fingerprints_array = [[*fp, homo, lumo] for fp, homo, lumo in zip(new_torsion_fingerprints, new_homo_energies, new_lumo_energies)]

# Running leaderboard: both lists stay sorted by descending R2 and index-aligned
# (best_models[k] is the model that scored best_r2_scores[k]).
best_models = []  # List to store the best models
num_best_models = 5  # Number of best models to save
best_r2_scores = [-float('inf')] * num_best_models  # -inf placeholders until real scores arrive

with open('out-SVR-sigmoid-torsion.txt', 'w') as f:
    for i in range(200):
        # The loop index doubles as the split seed, so every split is reproducible
        x_train, x_test, y_train, y_test = train_test_split(fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='sigmoid')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the held-out 20 %
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first slot this score beats, then trim both lists back to the top N
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Use the best models to make predictions on the blind-test molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column frame of predictions per kept model (rows follow the CSV input order)
result_dfs = [
    pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    for idx, predictions in enumerate(new_predictions_best)
]

# Place the per-model columns side by side
result_df = pd.concat(result_dfs, axis=1)

# Transpose so each row is one model and each column one blind-test molecule
result_df_transposed = result_df.T

# header=False drops the positional column labels; the model names remain as the row index
result_df_transposed.to_csv('predicted_dff-sigmoid-torsion-en.csv', header=False)

# Save the hold-out R2 values of the best models, best first
with open('r2_values_best-sigmoid-torsion-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")

# Daylight linear

In [15]:
# Read the training set: SMILES strings, HOMO/LUMO energies and the dFF target
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]


def _daylight_feature_rows(smiles_values, homo_values, lumo_values, source):
    """Build one feature row per molecule: fingerprint bits + HOMO + LUMO.

    The fingerprint is ``Chem.RDKFingerprint`` (called "Daylight" in this
    notebook).

    Args:
        smiles_values: iterable of SMILES strings.
        homo_values: iterable of HOMO energies, aligned with ``smiles_values``.
        lumo_values: iterable of LUMO energies, aligned with ``smiles_values``.
        source: label of the input file, used only in the error message.

    Returns:
        Tuple ``(mols, fingerprints, rows)`` where ``rows[k]`` is the list of
        fingerprint bits of molecule ``k`` followed by its HOMO and LUMO value.

    Raises:
        ValueError: if RDKit cannot parse one or more SMILES strings.
    """
    parsed = [Chem.MolFromSmiles(smiles) for smiles in smiles_values]

    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending strings instead of an opaque error inside the fingerprinter.
    unparsed = [smiles for smiles, mol in zip(smiles_values, parsed) if mol is None]
    if unparsed:
        raise ValueError(f"{source}: RDKit could not parse SMILES {unparsed!r}")

    fingerprints = [Chem.RDKFingerprint(mol) for mol in parsed]
    rows = [[*fp, homo, lumo] for fp, homo, lumo in zip(fingerprints, homo_values, lumo_values)]
    return parsed, fingerprints, rows


# Generate Daylight fingerprints and combine them with HOMO and LUMO energies
mols, daylight_fingerprints, fingerprints_array = _daylight_feature_rows(
    smiles_list, homo_energies, lumo_energies, 'HOMO-LUMO-energies.xlsx')

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same featurization for the new molecules so the columns match the training rows
new_mols, new_daylight_fingerprints, new_fingerprints_array = _daylight_feature_rows(
    new_smiles_list, new_homo_energies, new_lumo_energies, 'INPUT-NEW-MOLS-correct-HOMOLUMO.csv')

# Best models from training
best_models = []  # Fitted regressors, kept in descending order of R2
num_best_models = 5  # Number of best models to save
# Pre-filled with -inf so that any finite score outranks an unfilled slot;
# best_models grows in step with the real scores at the front of this list.
best_r2_scores = [-float('inf')] * num_best_models

with open('out-SVR-linear-daylight.txt', 'w') as f:
    # One fit per random 80/20 split; the loop index doubles as the split seed
    for i in range(200):
        # Splitting dataset into train and test data
        x_train, x_test, y_train, y_test = train_test_split(
            fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='linear')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the testing dataset
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first rank this score beats, then trim both lists back
        # to num_best_models entries.
        # NOTE(review): models are ranked on the same held-out split used to
        # score them, so the recorded best R2 values are likely optimistic.
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Now 'best_models' contains the top 5 models with the highest R-squared scores.

# Use the best models to make predictions on new molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column DataFrame per best model (no molecule identifiers are attached)
result_dfs = []
for idx, predictions in enumerate(new_predictions_best):
    result_df = pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    result_dfs.append(result_df)

# Place the per-model columns side by side (axis=1)
result_df = pd.concat(result_dfs, axis=1)

# Transpose so that each model becomes one row
result_df_transposed = result_df.T

# Write the transposed DataFrame to a CSV file (row labels kept, no header line)
result_df_transposed.to_csv('predicted_dff-linear-daylight-en.csv', header=False)

# Save the R-squared values of the best models
with open('r2_values_best-linear-daylight-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")


# Atompairs sigmoid

In [16]:
# Read the training set: SMILES strings, HOMO/LUMO energies and the dFF target
df = pd.read_excel('HOMO-LUMO-energies.xlsx')
smiles_list = df["Smiles"]
homo_energies = df["HOMO energy (eV)"]
lumo_energies = df["LUMO energy (eV)"]
dff_values = df["dFF"]


def _atompair_feature_rows(smiles_values, homo_values, lumo_values, source):
    """Build one feature row per molecule: atom-pair bits + HOMO + LUMO.

    The fingerprint is ``AllChem.GetHashedAtomPairFingerprintAsBitVect`` with
    its default settings.

    Args:
        smiles_values: iterable of SMILES strings.
        homo_values: iterable of HOMO energies, aligned with ``smiles_values``.
        lumo_values: iterable of LUMO energies, aligned with ``smiles_values``.
        source: label of the input file, used only in the error message.

    Returns:
        Tuple ``(mols, fingerprints, rows)`` where ``rows[k]`` is the list of
        fingerprint bits of molecule ``k`` followed by its HOMO and LUMO value.

    Raises:
        ValueError: if RDKit cannot parse one or more SMILES strings.
    """
    parsed = [Chem.MolFromSmiles(smiles) for smiles in smiles_values]

    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending strings instead of an opaque error inside the fingerprinter.
    unparsed = [smiles for smiles, mol in zip(smiles_values, parsed) if mol is None]
    if unparsed:
        raise ValueError(f"{source}: RDKit could not parse SMILES {unparsed!r}")

    fingerprints = [AllChem.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in parsed]
    rows = [[*fp, homo, lumo] for fp, homo, lumo in zip(fingerprints, homo_values, lumo_values)]
    return parsed, fingerprints, rows


# Generate AtomPairs fingerprints and combine them with HOMO and LUMO energies
mols, atompairs_fingerprints, fingerprints_array = _atompair_feature_rows(
    smiles_list, homo_energies, lumo_energies, 'HOMO-LUMO-energies.xlsx')

# Read the input CSV file for the blind test dataset
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_homo_energies = df_new["HOMO (eV)"]
new_lumo_energies = df_new["LUMO (eV)"]

# Same featurization for the new molecules so the columns match the training rows
new_mols, new_atompairs_fingerprints, new_fingerprints_array = _atompair_feature_rows(
    new_smiles_list, new_homo_energies, new_lumo_energies, 'INPUT-NEW-MOLS-correct-HOMOLUMO.csv')

# Best models from training
best_models = []  # Fitted regressors, kept in descending order of R2
num_best_models = 5  # Number of best models to save
# Pre-filled with -inf so that any finite score outranks an unfilled slot;
# best_models grows in step with the real scores at the front of this list.
best_r2_scores = [-float('inf')] * num_best_models

with open('out-SVR-sigmoid-atompairs.txt', 'w') as f:
    # One fit per random 80/20 split; the loop index doubles as the split seed
    for i in range(200):
        # Splitting dataset into train and test data
        x_train, x_test, y_train, y_test = train_test_split(
            fingerprints_array, dff_values, test_size=0.20, random_state=i)
        regressor = SVR(kernel='sigmoid')
        regressor.fit(x_train, y_train)

        # Evaluate the model on the testing dataset
        y_pred = regressor.predict(x_test)
        r2_score_value = r2_score(y_test, y_pred)

        # Insert at the first rank this score beats, then trim both lists back
        # to num_best_models entries.
        # NOTE(review): models are ranked on the same held-out split used to
        # score them, so the recorded best R2 values are likely optimistic.
        for j, best_r2 in enumerate(best_r2_scores):
            if r2_score_value > best_r2:
                best_models.insert(j, regressor)
                best_r2_scores.insert(j, r2_score_value)
                del best_models[num_best_models:]
                del best_r2_scores[num_best_models:]
                break

        print(f"Iteration {i + 1}: R2 Score = {r2_score_value}", file=f)

# Now 'best_models' contains the top 5 models with the highest R-squared scores.

# Use the best models to make predictions on new molecules
new_predictions_best = [model.predict(new_fingerprints_array) for model in best_models]

# One single-column DataFrame per best model (no molecule identifiers are attached)
result_dfs = []
for idx, predictions in enumerate(new_predictions_best):
    result_df = pd.DataFrame(predictions, columns=[f'Model_{idx + 1}_Prediction'])
    result_dfs.append(result_df)

# Place the per-model columns side by side (axis=1)
result_df = pd.concat(result_dfs, axis=1)

# Transpose so that each model becomes one row
result_df_transposed = result_df.T

# Write the transposed DataFrame to a CSV file (row labels kept, no header line)
result_df_transposed.to_csv('predicted_dff-sigmoid-atompairs-en.csv', header=False)

# Save the R-squared values of the best models
with open('r2_values_best-sigmoid-atompairs-en.txt', 'w') as f:
    for j, best_r2 in enumerate(best_r2_scores):
        f.write(f"Best Model {j + 1} R2: {best_r2}\n")
