**1. Load the molecules and combine them**

In [1]:
!pip install rdkit-pypi


Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [3]:
# Import necessary libraries
import pandas as pd

# Prediction CSVs (paths relative to the working directory) paired with the
# name of the model each file is labelled with. Insertion order is the order
# in which the files are loaded further down.
files_and_models = dict([
    ('all_predictions_pocket_tox_docking.csv', 'Pocket2Mol'),
    ('all_predictions_drug_tox_docking.csv', 'DrugGPT'),
    ('all_predictions_diff_tox_docking.csv', 'DiffSBDD'),
    ('600_predictions_lingo_tox_docking.csv', 'Lingo3DMol'),
    ('all_predictions_rga_tox_docking.csv', 'RGA'),
])
# Disabled alternative input kept from the original notebook
# (NOTE(review): looks like a single-file run — confirm whether still needed):
# files_and_models = {
#     'dyrk1a_tox_predictions.csv': 'RGA'
# }

def load_data(file_path, model_name):
    """Read one CSV and tag every row with the model it belongs to.

    Parameters
    ----------
    file_path : path or file-like object accepted by ``pd.read_csv``.
    model_name : value written into the ``Model`` column of every row.

    Returns
    -------
    pd.DataFrame
        The CSV contents plus a ``Model`` column set to ``model_name``.
    """
    return pd.read_csv(file_path).assign(Model=model_name)

# One labelled DataFrame per model, keyed by model name (files are read in
# the order they appear in files_and_models).
data_frames = dict(
    (model_name, load_data(csv_path, model_name))
    for csv_path, model_name in files_and_models.items()
)

# Stack the per-model frames into a single table with a fresh 0..n-1 index.
combined_df = pd.concat(list(data_frames.values()), ignore_index=True)
print(f"Number of initial molecules : {len(combined_df)}")

# Per-model molecule counts BEFORE any filtering. The label printed below
# says "passing all filters", but no filter has been applied at this point.
distribution_by_model = combined_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of initial molecules : 2249
Distribution of molecules passing all filters by model:
Model
Lingo3DMol    600
Pocket2Mol    562
RGA           416
DrugGPT       400
DiffSBDD      271
Name: count, dtype: int64


**2. Toxicity filter**

In [4]:
# Toxicity columns that take part in this filter
columnas_toxicidad = [
    'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase',
    'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
    'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53',
]

# A molecule is kept only when every listed column is strictly below 0.5.
# A missing value never compares below 0.5, so rows with gaps are dropped too.
filtered_df = combined_df.loc[(combined_df[columnas_toxicidad] < 0.5).all(axis=1)]

# Report how many molecules are left, overall and per model, after this step.
print(f"Number of molecules remaining after toxicity filter: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after toxicity filter: 762
Distribution of molecules passing all filters by model:
Model
DiffSBDD      200
Lingo3DMol    171
Pocket2Mol    168
DrugGPT       162
RGA            61
Name: count, dtype: int64


**3. QED filter**

In [5]:
# Keep only the molecules whose QED is strictly greater than 0.5.
filtered_df = filtered_df.loc[filtered_df['QED'].gt(0.5)]

# Report how many molecules survive the toxicity and QED steps, overall and
# per model.
print(f"Number of molecules remaining after applying toxicity and QED filters: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after applying toxicity and QED filters: 556
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    161
Lingo3DMol    161
DrugGPT        96
DiffSBDD       85
RGA            53
Name: count, dtype: int64


**4. LogP filter**

In [6]:
# Keep only the molecules with 1.5 <= logP <= 3.5 (both bounds inclusive).
# NOTE(review): an earlier comment on this cell described the window as 0 to 5;
# the code, and the counts recorded below it, use 1.5 to 3.5.
filtered_df = filtered_df.loc[filtered_df['logP'].between(1.5, 3.5)]

# Report how many molecules survive the toxicity, QED and logP steps, overall
# and per model.
print(f"Number of molecules remaining after applying filters for toxicity, QED and LogP: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after applying filters for toxicity, QED and LogP: 297
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    97
Lingo3DMol    94
DrugGPT       57
DiffSBDD      28
RGA           21
Name: count, dtype: int64


**5. Similarity filter**

In [8]:
# Keep only the molecules whose Similarity is strictly below 0.3.
# (What the similarity is measured against is not shown in this notebook
# section — presumably computed upstream; confirm.)
filtered_df = filtered_df.loc[filtered_df['Similarity'].lt(0.3)]

# Report how many molecules survive the steps applied so far, overall and per
# model.
print(f"Number of molecules remaining after applying filters for toxicity, QED, LogP, and Similarity: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after applying filters for toxicity, QED, LogP, and Similarity: 206
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    79
DrugGPT       51
Lingo3DMol    47
DiffSBDD      28
RGA            1
Name: count, dtype: int64


**6. Molecular weight filter**

In [9]:
# Keep only the molecules with 180 <= MolecularWeight <= 480 (both bounds
# inclusive).
filtered_df = filtered_df.loc[filtered_df['MolecularWeight'].between(180, 480)]

# Report how many molecules survive the steps applied so far (now including
# molecular weight), overall and per model.
print(f"Number of molecules remaining after applying filters for toxicity, QED, LogP, Similarity and Molecular Weight: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after applying filters for toxicity, QED, LogP, Similarity and Molecular Weight: 200
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    79
DrugGPT       47
Lingo3DMol    47
DiffSBDD      26
RGA            1
Name: count, dtype: int64


**7. SAS filter**

In [10]:
# Keep only the molecules whose SAS is strictly below 5.
# (SAS is presumably a synthetic accessibility score — confirm.)
filtered_df = filtered_df.loc[filtered_df['SAS'].lt(5)]

# Report how many molecules survive the steps applied so far (now including
# SAS), overall and per model.
print(f"Number of molecules remaining after applying filters for toxicity, QED, LogP, Similarity, Molecular Weight and SAS: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")

Number of molecules remaining after applying filters for toxicity, QED, LogP, Similarity, Molecular Weight and SAS: 186
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    73
Lingo3DMol    47
DrugGPT       46
DiffSBDD      19
RGA            1
Name: count, dtype: int64


**8. HBA, HBD filter**

In [11]:
# Keep only the molecules with at most 5 hydrogen bond donors (HBD) and at
# most 10 hydrogen bond acceptors (HBA); both conditions must hold.
filtered_df = filtered_df.loc[filtered_df['HBD'].le(5) & filtered_df['HBA'].le(10)]

# Report how many molecules are left after the donor/acceptor limits, overall
# and per model.
print(f"Number of molecules remaining after applying Lipinski criteria: {len(filtered_df)}")
distribution_by_model = filtered_df['Model'].value_counts()
print("Distribution of molecules passing all filters by model:", distribution_by_model, sep="\n")


Number of molecules remaining after applying Lipinski criteria: 186
Distribution of molecules passing all filters by model:
Model
Pocket2Mol    73
Lingo3DMol    47
DrugGPT       46
DiffSBDD      19
RGA            1
Name: count, dtype: int64
