In [None]:
!pip install rdkit-pypi
!pip install openpyxl
!pip install rdkit-pypi avalon_framework

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Collecting avalon_framework
  Downloading avalon_framework-1.8.2.tar.gz (3.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: avalon_framework
  Building wheel for avalon_framework (setup.py) ... [?25l[?25hdone
  Created wheel for avalon_framework: filename=avalon_framework-1.8.2-py3-none-any.whl size=3864 sha256=c1b72d5b58b8cb38e9c756045e51aa7356fe55e0d4adc945185b9026edb9f65b
  Stored in directory: /root/.cache/pip/wheels/78/3f/5c/a65bfa8ce94f62739865cf30e5687272ee719961b4311d05e3
Successfully built avalon_framework
Installing collected packages: avalon_framework
Successfully installed avalon_framework-1.8.2


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
#from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import learning_curve

# Morgan

In [None]:
# Morgan-fingerprint + HOMO/LUMO SVC classifier.
# Steps: load the training sheet, build a numeric feature matrix, train 200 SVC models on
# different random splits, keep the five best models whose test F1 lies in a target range,
# and use those models to classify the new molecules.

# Load the Excel file containing compound label, SMILES, dFF, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# Rename the columns positionally so the rest of the cell can rely on fixed names
# (pandas raises here if the sheet does not have exactly five columns)
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert dFF, HOMO and LUMO columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES. MolFromSmiles returns None for a SMILES it cannot
# parse, so fail here with the offending strings instead of deep inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
bad_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if bad_smiles:
    raise ValueError(f"RDKit could not parse these training SMILES: {bad_smiles}")

# Generate Morgan fingerprints (radius 2) as bit vectors
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]

# Binary class label: 1 when dFF is above the 0.3 threshold, otherwise 0
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert each fingerprint to a row of integer bits. Using ints (not the '0'/'1' characters
# of the bit string) keeps the stacked feature matrix numeric instead of a string array.
fingerprints_list = [[int(bit) for bit in fp.ToBitString()] for fp in fingerprints]

# Combine fingerprints, HOMO, and LUMO into one float feature matrix
features = np.column_stack((np.array(fingerprints_list, dtype=float), homo_values, lumo_values))

# DataFrame holding the class label and the per-molecule feature vector
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Features (X) and labels (y); the column count follows the actual fingerprint length
X = pd.DataFrame(features, columns=[f"Feature_{i}" for i in range(features.shape[1])])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Evaluation metrics and confusion matrices for every trained model
confusion_matrix_list = []
metrics_list = []

num_models = 200

# Only models whose test F1 lies inside this closed range are eligible for the top-5 list
f1_range_lower = 0.75
f1_range_upper = 0.9

# One record per eligible model, so index, scores and fitted model always stay together
candidate_models = []

# Train num_models models, each on a different random split
for i in range(num_models):
    # Shuffle the data using the seeded global NumPy generator
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = y.iloc[permutation]

    # Split the shuffled features and labels into training and testing datasets
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)

    # Initialize and train the SVC model
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)
    # Not used further in this cell; kept in case later cells read it (e.g. for ROC curves)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    if f1_range_lower <= f1 <= f1_range_upper:
        candidate_models.append({"index": i, "f1": f1, "accuracy": accuracy, "model": model})

# Rank eligible models by F1 (descending; ties keep training order) and keep the best five.
# All four lists are derived from the same ranked records, so position k in each list
# refers to the same model.
# NOTE(review): selection uses the test-split F1, so the reported scores are optimistic.
candidate_models.sort(key=lambda candidate: candidate["f1"], reverse=True)
best_candidates = candidate_models[:5]
top_models_indices = [candidate["index"] for candidate in best_candidates]
top_models_f1_scores = [candidate["f1"] for candidate in best_candidates]
top_models_accuracies = [candidate["accuracy"] for candidate in best_candidates]
top_models = [candidate["model"] for candidate in best_candidates]

# Save the indices, f1 scores, and accuracies of the top models
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-Morgan-t3-energy.csv", index=False)

# Predict classes for new molecules
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
bad_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if bad_new_smiles:
    raise ValueError(f"RDKit could not parse these new-molecule SMILES: {bad_new_smiles}")
new_Morgan_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Build the new-molecule features exactly like the training features
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_Morgan_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# The models were fitted on a DataFrame with named columns, so predict with the same names
new_features_df = pd.DataFrame(new_features, columns=X.columns)

# One row per new molecule, one prediction column per selected model
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names})

for idx, model in enumerate(top_models):
    predictions_df[f'Model_{idx + 1}_Predictions'] = model.predict(new_features_df)

# Save predictions to a CSV file
predictions_df.to_csv("class-predictions-seed45-Morgan-t3-energy-AllModels.csv", index=False)








Shape of X: (63, 2050)


# MACCS

In [None]:
from rdkit.Chem import MACCSkeys

# MACCS-keys + HOMO/LUMO SVC classifier.
# Steps: load the training sheet, build a numeric feature matrix, train 200 SVC models on
# different random splits, keep the five best models whose test F1 lies in a target range,
# and use those models to classify the new molecules.

# Load the Excel file containing compound label, SMILES, dFF, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# Rename the columns positionally so the rest of the cell can rely on fixed names
# (pandas raises here if the sheet does not have exactly five columns)
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert dFF, HOMO and LUMO columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES. MolFromSmiles returns None for a SMILES it cannot
# parse, so fail here with the offending strings instead of deep inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
bad_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if bad_smiles:
    raise ValueError(f"RDKit could not parse these training SMILES: {bad_smiles}")

# Generate MACCS-keys fingerprints
fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]

# Binary class label: 1 when dFF is above the 0.3 threshold, otherwise 0
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert each fingerprint to a row of integer bits. Using ints (not the '0'/'1' characters
# of the bit string) keeps the stacked feature matrix numeric instead of a string array.
fingerprints_list = [[int(bit) for bit in fp.ToBitString()] for fp in fingerprints]

# Combine fingerprints, HOMO, and LUMO into one float feature matrix
features = np.column_stack((np.array(fingerprints_list, dtype=float), homo_values, lumo_values))

# DataFrame holding the class label and the per-molecule feature vector
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Features (X) and labels (y); the column count follows the actual fingerprint length
X = pd.DataFrame(features, columns=[f"Feature_{i}" for i in range(features.shape[1])])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Evaluation metrics and confusion matrices for every trained model
confusion_matrix_list = []
metrics_list = []

num_models = 200

# Only models whose test F1 lies inside this closed range are eligible for the top-5 list
f1_range_lower = 0.75
f1_range_upper = 0.9

# One record per eligible model, so index, scores and fitted model always stay together
candidate_models = []

# Train num_models models, each on a different random split
for i in range(num_models):
    # Shuffle the data using the seeded global NumPy generator
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = y.iloc[permutation]

    # Split the shuffled features and labels into training and testing datasets
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)

    # Initialize and train the SVC model
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)
    # Not used further in this cell; kept in case later cells read it (e.g. for ROC curves)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    if f1_range_lower <= f1 <= f1_range_upper:
        candidate_models.append({"index": i, "f1": f1, "accuracy": accuracy, "model": model})

# Rank eligible models by F1 (descending; ties keep training order) and keep the best five.
# All four lists are derived from the same ranked records, so position k in each list
# refers to the same model.
# NOTE(review): selection uses the test-split F1, so the reported scores are optimistic.
candidate_models.sort(key=lambda candidate: candidate["f1"], reverse=True)
best_candidates = candidate_models[:5]
top_models_indices = [candidate["index"] for candidate in best_candidates]
top_models_f1_scores = [candidate["f1"] for candidate in best_candidates]
top_models_accuracies = [candidate["accuracy"] for candidate in best_candidates]
top_models = [candidate["model"] for candidate in best_candidates]

# Save the indices, f1 scores, and accuracies of the top models
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-Maccs-t3-energy.csv", index=False)

# Predict classes for new molecules
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
bad_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if bad_new_smiles:
    raise ValueError(f"RDKit could not parse these new-molecule SMILES: {bad_new_smiles}")
new_maccs_fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Build the new-molecule features exactly like the training features
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_maccs_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# The models were fitted on a DataFrame with named columns, so predict with the same names
new_features_df = pd.DataFrame(new_features, columns=X.columns)

# One row per new molecule, one prediction column per selected model
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names})

for idx, model in enumerate(top_models):
    predictions_df[f'Model_{idx + 1}_Predictions'] = model.predict(new_features_df)

# Save predictions to a CSV file
predictions_df.to_csv("class-predictions-seed45-MACCS-t3-energy-AllModels.csv", index=False)




Shape of X: (63, 169)




# Daylight

In [None]:
from rdkit import Chem
from rdkit.Chem import DataStructs

# Daylight-style (RDKFingerprint) + HOMO/LUMO SVC classifier.
# Steps: load the training sheet, build a numeric feature matrix, train 200 SVC models on
# different random splits, keep the five best models whose test F1 lies in a target range,
# and write one prediction file per selected model for the new molecules.

# Load the Excel file containing compound label, SMILES, dFF, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# Rename the columns positionally so the rest of the cell can rely on fixed names
# (pandas raises here if the sheet does not have exactly five columns)
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert dFF, HOMO and LUMO columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES. MolFromSmiles returns None for a SMILES it cannot
# parse, so fail here with the offending strings instead of deep inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
bad_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if bad_smiles:
    raise ValueError(f"RDKit could not parse these training SMILES: {bad_smiles}")

# Generate RDKit topological (Daylight-like) fingerprints
Daylight_fingerprints = [Chem.RDKFingerprint(mol) for mol in mols]

# Binary class label: 1 when dFF is above the 0.3 threshold, otherwise 0
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert each fingerprint to a row of integer bits. Using ints (not the '0'/'1' characters
# of the bit string) keeps the stacked feature matrix numeric instead of a string array.
fingerprints_list = [[int(bit) for bit in fp.ToBitString()] for fp in Daylight_fingerprints]

# Combine fingerprints, HOMO, and LUMO into one float feature matrix
features = np.column_stack((np.array(fingerprints_list, dtype=float), homo_values, lumo_values))

# DataFrame holding the class label and the per-molecule feature vector
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Features (X) and labels (y); the column count follows the actual fingerprint length
X = pd.DataFrame(features, columns=[f"Feature_{i}" for i in range(features.shape[1])])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Evaluation metrics and confusion matrices for every trained model
confusion_matrix_list = []
metrics_list = []

num_models = 200

# Only models whose test F1 lies inside this closed range are eligible for the top-5 list
f1_range_lower = 0.75
f1_range_upper = 0.9

# One record per eligible model, so index, scores and fitted model always stay together
candidate_models = []

# Train num_models models, each on a different random split
for i in range(num_models):
    # Shuffle the data using the seeded global NumPy generator
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = y.iloc[permutation]

    # Split the shuffled features and labels into training and testing datasets
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)

    # Initialize and train the SVC model
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)
    # Not used further in this cell; kept in case later cells read it (e.g. for ROC curves)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    if f1_range_lower <= f1 <= f1_range_upper:
        candidate_models.append({"index": i, "f1": f1, "accuracy": accuracy, "model": model})

# Rank eligible models by F1 (descending; ties keep training order) and keep the best five.
# All four lists are derived from the same ranked records, so position k in each list
# refers to the same model.
# NOTE(review): selection uses the test-split F1, so the reported scores are optimistic.
candidate_models.sort(key=lambda candidate: candidate["f1"], reverse=True)
best_candidates = candidate_models[:5]
top_models_indices = [candidate["index"] for candidate in best_candidates]
top_models_f1_scores = [candidate["f1"] for candidate in best_candidates]
top_models_accuracies = [candidate["accuracy"] for candidate in best_candidates]
top_models = [candidate["model"] for candidate in best_candidates]

# Save the indices, f1 scores, and accuracies of the top models
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-Daylight-t3-energy.csv", index=False)

# Predict classes for new molecules
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
bad_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if bad_new_smiles:
    raise ValueError(f"RDKit could not parse these new-molecule SMILES: {bad_new_smiles}")
new_Daylight_fingerprints = [Chem.RDKFingerprint(mol) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Build the new-molecule features exactly like the training features
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_Daylight_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# The models were fitted on a DataFrame with named columns, so predict with the same names.
# Built once here because it does not depend on the model.
feature_names = X.columns.tolist()
new_features_df = pd.DataFrame(new_features, columns=feature_names)

# One row per new molecule, one prediction column per selected model
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names, "Molecule SMILES": new_smiles_list})

# top_models_indices and top_models are aligned, so each file is named after the model
# (1-based training-run number) that actually produced its predictions
for idx, (model_index, model) in enumerate(zip(top_models_indices, top_models)):
    prediction_column = f'Model_{idx + 1}_Predictions'
    predictions = model.predict(new_features_df)
    predictions_df[prediction_column] = predictions

    # Save this model's predictions to its own CSV file
    model_predictions_filename = f"class-predictions-seed45-daylight-t3-energy-Model{model_index + 1}.csv"
    predictions_df[['Molecule Names', prediction_column]].to_csv(model_predictions_filename, index=False)




Shape of X: (63, 2050)


KeyboardInterrupt: 

# Atompairs

In [None]:
from rdkit.Chem import rdMolDescriptors

# Atom-pair fingerprint + HOMO/LUMO SVC classifier.
# Steps: load the training sheet, build a numeric feature matrix, train 200 SVC models on
# different random splits, keep the five best models whose test F1 lies in a target range,
# and write one prediction file per selected model for the new molecules.

# Load the Excel file containing compound label, SMILES, dFF, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# Rename the columns positionally so the rest of the cell can rely on fixed names
# (pandas raises here if the sheet does not have exactly five columns)
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert dFF, HOMO and LUMO columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES. MolFromSmiles returns None for a SMILES it cannot
# parse, so fail here with the offending strings instead of deep inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
bad_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if bad_smiles:
    raise ValueError(f"RDKit could not parse these training SMILES: {bad_smiles}")

# Generate hashed atom-pair fingerprints as bit vectors
atom_pairs_fingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in mols]

# Binary class label: 1 when dFF is above the 0.3 threshold, otherwise 0
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert each fingerprint to a row of integer bits. Using ints (not the '0'/'1' characters
# of the bit string) keeps the stacked feature matrix numeric instead of a string array.
fingerprints_list = [[int(bit) for bit in fp.ToBitString()] for fp in atom_pairs_fingerprints]

# Combine fingerprints, HOMO, and LUMO into one float feature matrix
features = np.column_stack((np.array(fingerprints_list, dtype=float), homo_values, lumo_values))

# DataFrame holding the class label and the per-molecule feature vector
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Features (X) and labels (y); the column count follows the actual fingerprint length
X = pd.DataFrame(features, columns=[f"Feature_{i}" for i in range(features.shape[1])])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Evaluation metrics and confusion matrices for every trained model
confusion_matrix_list = []
metrics_list = []

num_models = 200

# Only models whose test F1 lies inside this closed range are eligible for the top-5 list
f1_range_lower = 0.75
f1_range_upper = 0.9

# One record per eligible model, so index, scores and fitted model always stay together
candidate_models = []

# Train num_models models, each on a different random split
for i in range(num_models):
    # Shuffle the data using the seeded global NumPy generator
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = y.iloc[permutation]

    # Split the shuffled features and labels into training and testing datasets
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)

    # Initialize and train the SVC model
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)
    # Not used further in this cell; kept in case later cells read it (e.g. for ROC curves)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    if f1_range_lower <= f1 <= f1_range_upper:
        candidate_models.append({"index": i, "f1": f1, "accuracy": accuracy, "model": model})

# Rank eligible models by F1 (descending; ties keep training order) and keep the best five.
# All four lists are derived from the same ranked records, so position k in each list
# refers to the same model.
# NOTE(review): selection uses the test-split F1, so the reported scores are optimistic.
candidate_models.sort(key=lambda candidate: candidate["f1"], reverse=True)
best_candidates = candidate_models[:5]
top_models_indices = [candidate["index"] for candidate in best_candidates]
top_models_f1_scores = [candidate["f1"] for candidate in best_candidates]
top_models_accuracies = [candidate["accuracy"] for candidate in best_candidates]
top_models = [candidate["model"] for candidate in best_candidates]

# Save the indices, f1 scores, and accuracies of the top models
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-atompairs-t3-energy.csv", index=False)

# Predict classes for new molecules
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
bad_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if bad_new_smiles:
    raise ValueError(f"RDKit could not parse these new-molecule SMILES: {bad_new_smiles}")
new_atom_pairs_fingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Build the new-molecule features exactly like the training features
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_atom_pairs_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# The models were fitted on a DataFrame with named columns, so predict with the same names.
# Built once here because it does not depend on the model.
feature_names = X.columns.tolist()
new_features_df = pd.DataFrame(new_features, columns=feature_names)

# One row per new molecule, one prediction column per selected model
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names, "Molecule SMILES": new_smiles_list})

# top_models_indices and top_models are aligned, so each file is named after the model
# (1-based training-run number) that actually produced its predictions
for idx, (model_index, model) in enumerate(zip(top_models_indices, top_models)):
    prediction_column = f'Model_{idx + 1}_Predictions'
    predictions = model.predict(new_features_df)
    predictions_df[prediction_column] = predictions

    # Save this model's predictions to its own CSV file
    model_predictions_filename = f"class-predictions-seed45-atompairs-t3-energy-Model{model_index + 1}.csv"
    predictions_df[['Molecule Names', prediction_column]].to_csv(model_predictions_filename, index=False)




Shape of X: (63, 2050)


# Avalon

In [None]:
from rdkit import Chem
from rdkit.Avalon import pyAvalonTools

# Avalon fingerprint + HOMO/LUMO SVC classifier.
# Steps: load the training sheet, build a numeric feature matrix, train 200 SVC models on
# different random splits, keep the five best models whose test F1 lies in a target range,
# and write one prediction file per selected model for the new molecules.

# Load the Excel file containing compound label, SMILES, dFF, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')

# Rename the columns positionally so the rest of the cell can rely on fixed names
# (pandas raises here if the sheet does not have exactly five columns)
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert dFF, HOMO and LUMO columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES. MolFromSmiles returns None for a SMILES it cannot
# parse, so fail here with the offending strings instead of deep inside the fingerprint call.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
bad_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if bad_smiles:
    raise ValueError(f"RDKit could not parse these training SMILES: {bad_smiles}")

# Generate Avalon fingerprints (default length; the recorded output shows 512 bits + 2 energies)
Avalon_fingerprints = [pyAvalonTools.GetAvalonFP(mol) for mol in mols]

# Binary class label: 1 when dFF is above the 0.3 threshold, otherwise 0
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert each fingerprint to a row of integer bits. Using ints (not the '0'/'1' characters
# of the bit string) keeps the stacked feature matrix numeric instead of a string array.
fingerprints_list = [[int(bit) for bit in fp.ToBitString()] for fp in Avalon_fingerprints]

# Combine fingerprints, HOMO, and LUMO into one float feature matrix
features = np.column_stack((np.array(fingerprints_list, dtype=float), homo_values, lumo_values))

# DataFrame holding the class label and the per-molecule feature vector
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Features (X) and labels (y); the column count follows the actual fingerprint length
X = pd.DataFrame(features, columns=[f"Feature_{i}" for i in range(features.shape[1])])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Evaluation metrics and confusion matrices for every trained model
confusion_matrix_list = []
metrics_list = []

num_models = 200

# Only models whose test F1 lies inside this closed range are eligible for the top-5 list
f1_range_lower = 0.75
f1_range_upper = 0.9

# One record per eligible model, so index, scores and fitted model always stay together
candidate_models = []

# Train num_models models, each on a different random split
for i in range(num_models):
    # Shuffle the data using the seeded global NumPy generator
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = y.iloc[permutation]

    # Split the shuffled features and labels into training and testing datasets
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)

    # Initialize and train the SVC model
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(x_test)
    # Not used further in this cell; kept in case later cells read it (e.g. for ROC curves)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    if f1_range_lower <= f1 <= f1_range_upper:
        candidate_models.append({"index": i, "f1": f1, "accuracy": accuracy, "model": model})

# Rank eligible models by F1 (descending; ties keep training order) and keep the best five.
# All four lists are derived from the same ranked records, so position k in each list
# refers to the same model.
# NOTE(review): selection uses the test-split F1, so the reported scores are optimistic.
candidate_models.sort(key=lambda candidate: candidate["f1"], reverse=True)
best_candidates = candidate_models[:5]
top_models_indices = [candidate["index"] for candidate in best_candidates]
top_models_f1_scores = [candidate["f1"] for candidate in best_candidates]
top_models_accuracies = [candidate["accuracy"] for candidate in best_candidates]
top_models = [candidate["model"] for candidate in best_candidates]

# Save the indices, f1 scores, and accuracies of the top models
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-avalon-t3-energy.csv", index=False)

# Predict classes for new molecules
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]
bad_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if bad_new_smiles:
    raise ValueError(f"RDKit could not parse these new-molecule SMILES: {bad_new_smiles}")
new_Avalon_fingerprints = [pyAvalonTools.GetAvalonFP(mol) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Build the new-molecule features exactly like the training features
# (every fingerprint bit is used; no fixed-width slice is needed)
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_Avalon_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# The models were fitted on a DataFrame with named columns, so predict with the same names.
# Built once here because it does not depend on the model.
feature_names = X.columns.tolist()
new_features_df = pd.DataFrame(new_features, columns=feature_names)

# One row per new molecule, one prediction column per selected model
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names, "Molecule SMILES": new_smiles_list})

# top_models_indices and top_models are aligned, so each file is named after the model
# (1-based training-run number) that actually produced its predictions
for idx, (model_index, model) in enumerate(zip(top_models_indices, top_models)):
    prediction_column = f'Model_{idx + 1}_Predictions'
    predictions = model.predict(new_features_df)
    predictions_df[prediction_column] = predictions

    # Save this model's predictions to its own CSV file
    model_predictions_filename = f"class-predictions-seed45-avalon-t3-energy-Model{model_index + 1}.csv"
    predictions_df[['Molecule Names', prediction_column]].to_csv(model_predictions_filename, index=False)




Shape of X: (63, 514)


In [None]:
from rdkit.Chem import rdMolDescriptors

# Load the Excel file containing SMILES, CLASS, HOMO, and LUMO data
df = pd.read_excel('HOMO-LUMO-energies.xlsx')  # Replace 'your_excel_file.xlsx' with the actual file name

# Define column names for better clarity
column_names = ["Cmpd Lab", "Smiles", "dFF", "HOMO (eV)", "LUMO (eV)"]

# Rename the DataFrame columns
df.columns = column_names

# Extract columns using descriptive variable names
smiles_column = df['Smiles']
ff_column = df['dFF']
homo_column = df['HOMO (eV)']
lumo_column = df['LUMO (eV)']

# Convert SMILES column to a list
smiles_list = smiles_column.to_list()

# Convert F/F columns to numeric arrays
ff_values = ff_column.to_numpy().astype(float)

# Convert HOMO and LUMO columns to numeric arrays
homo_values = homo_column.to_numpy().astype(float)
lumo_values = lumo_column.to_numpy().astype(float)

# Create RDKit Mol objects from SMILES
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in mols]

# Create 'labels' array based on positive and negative numbers
df["Class"] = (df["dFF"] > 0.3).astype(int)

# Set a fixed random seed for reproducibility
seed_value = 45
np.random.seed(seed_value)

# Convert RDKit fingerprints to a list of lists
fingerprints_list = [list(fp.ToBitString()) for fp in torsion_fingerprints]

# Combine fingerprints, HOMO, and LUMO into features
features = np.column_stack((np.array(fingerprints_list), homo_values, lumo_values))

# Create a new DataFrame with molecular features and class labels
new_df = pd.DataFrame(data={"Class": df["Class"], "Features": list(features)})

# Split the data into features (X) and labels (y)
X = pd.DataFrame(new_df["Features"].to_list(), columns=[f"Feature_{i}" for i in range(2048 + 2)])
y = new_df["Class"]

print("Shape of X:", X.shape)

# Use fingerprints directly for features
#features = np.array(fingerprints_list)

# Per-run results: one confusion matrix and one metrics dict for every trained model
confusion_matrix_list = []
metrics_list = []

# Number of shuffle / split / train repetitions
num_models = 200

# Parallel lists describing every run whose F1 score falls inside the accepted window.
# No limit of five is applied in this loop: each qualifying run is appended in loop order.
top_models_indices = []
top_models_f1_scores = []
top_models_accuracies = []
top_models = []  # Fitted SVC objects, in the same order as the three lists above

# Inclusive F1 window a run must fall in to be retained (runs scoring above 0.9 are excluded as well)
f1_range_lower = 0.75
f1_range_upper = 0.9

# Repeat the experiment num_models times; i is the 0-based run number and also the seed of the split
for i in range(num_models):
    # Reorder the rows using the global NumPy RNG; X and the labels share one permutation so they stay paired
    permutation = np.random.permutation(len(X))
    shuffled_X = X.iloc[permutation]
    shuffled_labels = new_df["Class"].iloc[permutation]

    # Hold out 20% of the rows as the test set; the split is seeded with the run number
    x_train, x_test, y_train, y_test = train_test_split(shuffled_X, shuffled_labels, test_size=0.20, random_state=i)


    # Debug output, disabled:
    #print("Shape of X_train:", x_train.shape)


    # Fit an SVC with no hyperparameters set other than probability=True,
    # which is required for the predict_proba call below.
    # NOTE(review): no random_state is passed, so any randomness inside SVC is not pinned
    # by this call -- presumably it falls back to the global NumPy RNG seeded earlier; confirm.
    model = SVC(probability=True)
    model.fit(x_train, y_train)

    # Predicted labels for the held-out rows, plus the second column of predict_proba
    # (presumably the probability of class 1 -- confirm via model.classes_).
    # y_prob is not referenced again in this cell.
    y_pred = model.predict(x_test)
    y_prob = model.predict_proba(x_test)[:, 1]

    # Test-set metrics, all computed with the scorers' default arguments
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Record the run; "Model" is 1-based (i + 1) whereas top_models_indices below stores the 0-based i
    metrics_list.append({"Model": i + 1, "Accuracy": accuracy, "F1 Score": f1, "Recall": recall})
    confusion_matrix_list.append(cm)

    # Retain the run (index, scores and fitted model) when its F1 lies inside the inclusive window
    if f1_range_lower <= f1 <= f1_range_upper:
        top_models_indices.append(i)
        top_models_f1_scores.append(f1)
        top_models_accuracies.append(accuracy)
        top_models.append(model)  # Keep the fitted estimator for later prediction

# Rank the retained runs by F1 score, highest first (ties keep their loop order),
# and keep the best five.
sorted_top_models_indices = [idx for idx, _ in sorted(enumerate(top_models_f1_scores), key=lambda x: x[1], reverse=True)]
best_positions = sorted_top_models_indices[:5]

# Reorder ALL four parallel lists with the same positions so that entry k refers
# to the same run in each of them. Re-sorting only top_models_indices would pair
# the saved F1/accuracy values, and the models taken from top_models[:5] for
# prediction, with the wrong model indices.
top_models_indices = [top_models_indices[pos] for pos in best_positions]
top_models_f1_scores = [top_models_f1_scores[pos] for pos in best_positions]
top_models_accuracies = [top_models_accuracies[pos] for pos in best_positions]
top_models = [top_models[pos] for pos in best_positions]

# Save the 0-based run index, F1 score and accuracy of the (at most) five best runs
top_models_info = pd.DataFrame({"Model Index": top_models_indices, "F1 Score": top_models_f1_scores, "Accuracy": top_models_accuracies})
top_models_info.to_csv("class-topmodels_info-seed45-torsion-t3-energy.csv", index=False)

# Predict classes for new molecules
# Load the new molecules; the label, SMILES, HOMO and LUMO columns are read from this CSV
df_new = pd.read_csv('INPUT-NEW-MOLS-correct-HOMOLUMO.csv')
new_smiles_list = df_new["Smiles"]
new_molecule_names = df_new["Cmpd Label"]
new_mols = [Chem.MolFromSmiles(smiles) for smiles in new_smiles_list]

# MolFromSmiles returns None for a string it cannot parse; report those entries
# up front instead of failing inside the fingerprint call.
unparsed_new_smiles = [smiles for smiles, mol in zip(new_smiles_list, new_mols) if mol is None]
if unparsed_new_smiles:
    raise ValueError(f"Could not parse SMILES for new molecules: {unparsed_new_smiles}")

new_torsion_fingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol) for mol in new_mols]

# Extract HOMO and LUMO columns for new molecules
new_homo_values = df_new['HOMO (eV)'].astype(float)
new_lumo_values = df_new['LUMO (eV)'].astype(float)

# Fingerprint bits as a float matrix, then append HOMO and LUMO as the last two columns
new_fingerprint_bits = np.array([[int(bit) for bit in fp.ToBitString()] for fp in new_torsion_fingerprints], dtype=float)
new_features = np.column_stack((new_fingerprint_bits, new_homo_values, new_lumo_values))

# Give the new feature matrix the same column names as the training frame X.
# This does not depend on the model, so it is built once, outside the loop;
# a column-count mismatch with X raises here.
feature_names = X.columns.tolist()
new_features_df = pd.DataFrame(new_features, columns=feature_names)

# One row per new molecule; a prediction column is added for each model below
predictions_df = pd.DataFrame({"Molecule Names": new_molecule_names, "Molecule SMILES": new_smiles_list})

# Run the first five stored models. The output file name is built from
# top_models_indices[idx] + 1, so this loop relies on top_models and
# top_models_indices being in the same order.
for idx, model in enumerate(top_models[:5]):
    # Predicted class for every new molecule
    predictions = model.predict(new_features_df)

    # Keep the predictions alongside the molecule names
    predictions_df[f'Model_{idx + 1}_Predictions'] = predictions

    # Write this model's predictions to its own CSV file
    model_predictions_filename = f"class-predictions-seed45-torsion-t3-energy-Model{top_models_indices[idx] + 1}.csv"
    predictions_df[['Molecule Names', f'Model_{idx + 1}_Predictions']].to_csv(model_predictions_filename, index=False)




Shape of X: (63, 2050)
