**1. Dependencies installation**

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import numpy as np
import pandas as pd

# Load the DYRK1A reference set (semicolon-separated, latin-1 encoded)
basedyrk1a_df = pd.read_csv('basedyrk1a.csv', sep=';', encoding='latin-1')

# Load the prediction table of each generative model, in the order
# drug, diff, rga, pocket, lingo
drug_df, diff_df, rga_df, pocket_df, lingo_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'all_predictions_drug_tox_docking.csv',
        'all_predictions_diff_tox_docking.csv',
        'all_predictions_rga_tox_docking.csv',
        'all_predictions_pocket_tox_docking.csv',
        '600_predictions_lingo_tox_docking.csv',
    )
)

**2. Tanimoto Similarity**

In [None]:




# Compute the fingerprints of a list of molecules
def get_fingerprints(smiles_list):
    """Return Morgan fingerprint bit vectors (radius 2, 2048 bits) for the given SMILES.

    SMILES for which ``Chem.MolFromSmiles`` yields a falsy result are skipped,
    so the returned list can be shorter than ``smiles_list``.
    """
    parsed_mols = (Chem.MolFromSmiles(smi) for smi in smiles_list)
    return [
        AllChem.GetMorganFingerprintAsBitVect(parsed, 2, nBits=2048)
        for parsed in parsed_mols
        if parsed  # drop molecules that could not be parsed
    ]

# Add the maximum-similarity column to a dataframe
def add_max_similarity_column(df, target_fps):
    """Add a 'Similarity' column holding each molecule's best Tanimoto match.

    For every SMILES in ``df['SMILES']`` a Morgan fingerprint (radius 2,
    2048 bits) is computed and compared against all fingerprints in
    ``target_fps``; the largest Tanimoto similarity is stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SMILES' column. Modified in place.
    target_fps : iterable of fingerprints
        Reference fingerprints to compare against.

    Notes
    -----
    The value is 0 when the SMILES cannot be parsed or when ``target_fps``
    is empty.
    """
    # Materialise once: the references are reused for every molecule, and a
    # one-shot iterable would otherwise be exhausted after the first row.
    reference_fps = list(target_fps)

    max_similarities = []
    for smiles in df['SMILES']:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            max_similarities.append(0)
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        # One bulk call replaces a Python-level loop over all references.
        similarities = DataStructs.BulkTanimotoSimilarity(fp, reference_fps)
        max_similarities.append(max(similarities, default=0))
    df['Similarity'] = max_similarities

# Fingerprint every molecule of the DYRK1A reference set
basedyrk1a_fps = get_fingerprints(basedyrk1a_df['SMILES'].tolist())

# Add the maximum-similarity column to each model's dataframe
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    add_max_similarity_column(model_df, basedyrk1a_fps)



In [None]:
# Percentage of molecules whose similarity exceeds a threshold (0.85 by default)
def calculate_percentage_high_similarity(df, threshold=0.85):
    """Return the percentage of rows whose 'Similarity' is strictly above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Similarity' column.
    threshold : float, optional
        Cut-off for "high" similarity. Defaults to 0.85.

    Returns
    -------
    float
        Share of rows above the threshold, scaled to 0-100 and taken over
        all rows of ``df``. NaN when ``df`` has no rows.
    """
    # The mean of a boolean mask is (rows above threshold) / (total rows).
    return (df['Similarity'] > threshold).mean() * 100

# Compute the high-similarity percentage for each model
(
    percentage_drug,
    percentage_diff,
    percentage_rga,
    percentage_pocket,
    percentage_lingo,
) = (
    calculate_percentage_high_similarity(model_frame)
    for model_frame in (drug_df, diff_df, rga_df, pocket_df, lingo_df)
)

# Report the percentages, one line per model
print(f"Para el modelo DrugGPT, el {percentage_drug:.2f}% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.")
print(f"Para el modelo Diff, el {percentage_diff:.2f}% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.")
print(f"Para el modelo RGA, el {percentage_rga:.2f}% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.")
print(f"Para el modelo Pocket2Mol, el {percentage_pocket:.2f}% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.")
print(f"Para el modelo Lingo, el {percentage_lingo:.2f}% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.")


Para el modelo DrugGPT, el 0.00% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.
Para el modelo Diff, el 0.00% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.
Para el modelo RGA, el 14.42% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.
Para el modelo Pocket2Mol, el 0.00% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.
Para el modelo Lingo, el 0.33% de las moléculas tienen una similitud > 0.85 con la base de datos DYRK1A.


**3. Molecular weight**

In [None]:
from rdkit.Chem import Descriptors

# Add the molecular-weight column to a dataframe
def add_molecular_weight_column(df):
    """Add a 'MolecularWeight' column computed with ``Descriptors.MolWt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SMILES' column. Modified in place.

    Notes
    -----
    SMILES that cannot be parsed get NaN. A placeholder of 0 would be
    averaged in as if it were a real weight and pull the mean down,
    whereas pandas skips NaN in ``.mean()`` and similar reductions.
    """
    molecular_weights = []
    for smiles in df['SMILES']:
        mol = Chem.MolFromSmiles(smiles)
        # NaN marks "no weight available" for molecules that failed to parse.
        molecular_weights.append(Descriptors.MolWt(mol) if mol else float('nan'))
    df['MolecularWeight'] = molecular_weights

# Add the molecular-weight column to each model's dataframe
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    add_molecular_weight_column(model_df)

# Mean of the molecular weights
def calculate_average_molecular_weight(df):
    """Return the mean of the 'MolecularWeight' column of ``df``."""
    return df['MolecularWeight'].mean()

# Compute the mean molecular weight for each model
(
    avg_weight_drug,
    avg_weight_diff,
    avg_weight_rga,
    avg_weight_pocket,
    avg_weight_lingo,
) = (
    calculate_average_molecular_weight(model_frame)
    for model_frame in (drug_df, diff_df, rga_df, pocket_df, lingo_df)
)

# Report the means, one line per model
print(f"El peso molecular medio para el modelo DrugGPT es: {avg_weight_drug:.2f}")
print(f"El peso molecular medio para el modelo Diff es: {avg_weight_diff:.2f}")
print(f"El peso molecular medio para el modelo RGA es: {avg_weight_rga:.2f}")
print(f"El peso molecular medio para el modelo Pocket2Mol es: {avg_weight_pocket:.2f}")
print(f"El peso molecular medio para el modelo Lingo es: {avg_weight_lingo:.2f}")


El peso molecular medio para el modelo DrugGPT es: 418.80
El peso molecular medio para el modelo Diff es: 268.67
El peso molecular medio para el modelo RGA es: 314.41
El peso molecular medio para el modelo Pocket2Mol es: 309.73
El peso molecular medio para el modelo Lingo es: 335.16


**4. Synthetic accessibility score (SAS)**

In [None]:
!wget https://raw.githubusercontent.com/rdkit/rdkit/master/Contrib/SA_Score/sascorer.py

from rdkit import Chem
import pandas as pd
import sascorer

# Add the SAS column to a dataframe
def add_sas_column(df):
    """Add a 'SAS' column with ``sascorer.calculateScore`` of every molecule.

    ``df`` must contain a 'SMILES' column and is modified in place.
    SMILES that cannot be parsed get ``None``.
    """
    def _score(smi):
        parsed = Chem.MolFromSmiles(smi)
        # None when the molecule could not be built from the SMILES
        return sascorer.calculateScore(parsed) if parsed else None

    df['SAS'] = [_score(smi) for smi in df['SMILES']]

# Add the SAS column to each model's dataframe
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    add_sas_column(model_df)

# Mean of the SAS values
def calculate_average_sas(df):
    """Return the mean of the 'SAS' column of ``df``, ignoring missing values."""
    valid_scores = df['SAS'].dropna()  # drop the None/NaN entries
    return valid_scores.mean()

# Compute the mean SAS for each model
(
    avg_sas_drug,
    avg_sas_diff,
    avg_sas_rga,
    avg_sas_pocket,
    avg_sas_lingo,
) = (
    calculate_average_sas(model_frame)
    for model_frame in (drug_df, diff_df, rga_df, pocket_df, lingo_df)
)

# Report the means, one line per model
print(f"El SAS medio para el modelo DrugGPT es: {avg_sas_drug:.2f}")
print(f"El SAS medio para el modelo Diff es: {avg_sas_diff:.2f}")
print(f"El SAS medio para el modelo RGA es: {avg_sas_rga:.2f}")
print(f"El SAS medio para el modelo Pocket2Mol es: {avg_sas_pocket:.2f}")
print(f"El SAS medio para el modelo Lingo es: {avg_sas_lingo:.2f}")




--2024-04-17 09:41:39--  https://raw.githubusercontent.com/rdkit/rdkit/master/Contrib/SA_Score/sascorer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5568 (5.4K) [text/plain]
Saving to: ‘sascorer.py.1’


2024-04-17 09:41:39 (48.3 MB/s) - ‘sascorer.py.1’ saved [5568/5568]

El SAS medio para el modelo DrugGPT es: 2.57
El SAS medio para el modelo Diff es: 4.76
El SAS medio para el modelo RGA es: 3.52
El SAS medio para el modelo Pocket2Mol es: 3.25
El SAS medio para el modelo Lingo es: 2.83


**5. Ring sizes**

In [None]:
from rdkit import Chem
import pandas as pd

# Add the largest ring size of every molecule to the DataFrame
def add_max_ring_size_column(df):
    """Add a 'Max Ring' column: ``get_max_ring_size`` applied to each SMILES.

    ``df`` must contain a 'SMILES' column and is modified in place.
    """
    smiles_column = df['SMILES']
    df['Max Ring'] = smiles_column.apply(get_max_ring_size)

def get_max_ring_size(smiles):
    """Return the size of the largest ring of the molecule given as SMILES.

    Returns 0 for a molecule without rings and ``None`` when the SMILES
    cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None  # unparseable SMILES
    atom_rings = mol.GetRingInfo().AtomRings()
    # Each ring is a collection of atoms; its length is the ring size.
    return max((len(ring) for ring in atom_rings), default=0)

# Relative frequency of each ring size in a list of ring sizes
def calculate_ring_size_distribution(ring_sizes):
    """Return the proportion of each ring size, indexed by size in ascending order."""
    proportions = pd.Series(ring_sizes).value_counts(normalize=True)
    return proportions.sort_index()

# Flat list of the sizes of all rings across all molecules in the DataFrame
def get_all_ring_sizes(df):
    """Return the sizes of every ring of every parseable molecule in ``df['SMILES']``.

    The result is one flat list (not grouped per molecule). SMILES that
    cannot be parsed contribute nothing.
    """
    parsed_mols = (Chem.MolFromSmiles(smi) for smi in df['SMILES'])
    return [
        len(ring)
        for parsed in parsed_mols
        if parsed  # skip molecules that could not be parsed
        for ring in parsed.GetRingInfo().AtomRings()
    ]

# Example usage:
# Load your DataFrames here (e.g., drug_df, diff_df, etc.)

# Add the 'Max Ring' column to each model's DataFrame
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    add_max_ring_size_column(model_df)

# Flat list of all ring sizes, per model
(
    drug_ring_sizes,
    diff_ring_sizes,
    rga_ring_sizes,
    pocket_ring_sizes,
    lingo_ring_sizes,
) = (
    get_all_ring_sizes(model_frame)
    for model_frame in (drug_df, diff_df, rga_df, pocket_df, lingo_df)
)

# Ring-size distribution, per model
(
    drug_size_distribution,
    diff_size_distribution,
    rga_size_distribution,
    pocket_size_distribution,
    lingo_size_distribution,
) = (
    calculate_ring_size_distribution(sizes)
    for sizes in (
        drug_ring_sizes,
        diff_ring_sizes,
        rga_ring_sizes,
        pocket_ring_sizes,
        lingo_ring_sizes,
    )
)

# Print the ring-size distribution of each model
print("Distribución de tamaños de anillos para el modelo DrugGPT:")
print(drug_size_distribution)
print("\nDistribución de tamaños de anillos para el modelo Diff:")
print(diff_size_distribution)
print("\nDistribución de tamaños de anillos para el modelo RGA:")
print(rga_size_distribution)
print("\nDistribución de tamaños de anillos para el modelo Pocket2Mol:")
print(pocket_size_distribution)
print("\nDistribución de tamaños de anillos para el modelo Lingo:")
print(lingo_size_distribution)



Distribución de tamaños de anillos para el modelo DrugGPT:
3     0.002679
4     0.002679
5     0.257200
6     0.726055
7     0.009377
8     0.000670
18    0.001340
Name: proportion, dtype: float64

Distribución de tamaños de anillos para el modelo Diff:
3     0.367089
4     0.052743
5     0.219409
6     0.303797
7     0.042194
8     0.008439
9     0.004219
12    0.002110
Name: proportion, dtype: float64

Distribución de tamaños de anillos para el modelo RGA:
3    0.013615
4    0.004084
5    0.314500
6    0.667801
Name: proportion, dtype: float64

Distribución de tamaños de anillos para el modelo Pocket2Mol:
3     0.000473
5     0.239583
6     0.722064
7     0.023674
8     0.003314
9     0.000947
11    0.000473
12    0.000473
13    0.002367
14    0.001894
15    0.001420
16    0.002841
17    0.000473
Name: proportion, dtype: float64

Distribución de tamaños de anillos para el modelo Lingo:
4    0.002810
5    0.364403
6    0.631382
7    0.001405
Name: proportion, dtype: float64


**6. Hydrogen-bond donors (HBD) and acceptors (HBA)**

In [None]:
from rdkit import Chem

# Number of hydrogen bond donors (HBD) of one molecule
def calculate_hbd(smiles):
    """Return ``CalcNumHBD`` for the molecule given as SMILES, or ``None`` if it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.rdMolDescriptors.CalcNumHBD(mol) if mol else None

# Number of hydrogen bond acceptors (HBA) of one molecule
def calculate_hba(smiles):
    """Return ``CalcNumHBA`` for the molecule given as SMILES, or ``None`` if it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.rdMolDescriptors.CalcNumHBA(mol) if mol else None

# Add the HBA and HBD columns to the DataFrame
def add_hba_hbd_columns(df):
    """Add 'HBA' and 'HBD' columns computed by ``calculate_hba`` / ``calculate_hbd``.

    ``df`` must contain a 'SMILES' column and is modified in place.
    """
    # One (hba, hbd) pair per molecule, in row order
    counts = [(calculate_hba(smi), calculate_hbd(smi)) for smi in df['SMILES']]
    df['HBA'] = [hba for hba, _ in counts]
    df['HBD'] = [hbd for _, hbd in counts]

# Add the HBA and HBD columns to each model's DataFrame
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    add_hba_hbd_columns(model_df)

# Print the first rows of each DataFrame to verify the new columns were added
for model_df in (drug_df, diff_df, rga_df, pocket_df, lingo_df):
    print(model_df.head())


                                              SMILES  pChEMBL Predicted  \
0          Cc1nc2cc(ccc2o1)C(=O)Nc1nc(cs1)-c1ccccc1F           6.062517   
1         COc1cccc(NC(=O)c2cc(nc3ccccc23)-c2ccco2)c1           6.614742   
2    Cc1ccc(NC(=O)Cn2nc(C)cc2C)cc1S(=O)(=O)N1CCCCCC1           6.103859   
3  CCOc1ccc(cc1)C(=O)Nc1ccc2c(CC)nn(c2c1)S(=O)(=O...           6.188527   
4  CC(C)C(NC(=O)c1ccc(cc1)C#N)C(=O)OCC(=O)Nc1cccc...           6.631953   

        QED     logP  Docking Scores  Similarity  MolecularWeight     NR-AR  \
0  0.581966  4.65112            -9.5    0.275862          353.378  0.089856   
1  0.579807  4.75570            -9.1    0.333333          344.370  0.084144   
2  0.830188  3.01176            -8.7    0.250000          404.536  0.024204   
3  0.336028  5.03490            -9.7    0.260870          500.580  0.043299   
4  0.677599  3.14798            -8.0    0.263736          413.861  0.042349   

   NR-AR-LBD    NR-AhR  ...  NR-PPAR-gamma    SR-ARE  SR-ATAD5    SR-HSE  

In [None]:
# Write the annotated tables back to CSV (without the index column).
# NOTE(review): the RGA table is written to a '*_sas_updated.csv' name, whereas
# the other four overwrite the file they were read from. Confirm this is intended.
for annotated_df, csv_path in (
    (drug_df, 'all_predictions_drug_tox_docking.csv'),
    (diff_df, 'all_predictions_diff_tox_docking.csv'),
    (rga_df, 'all_predictions_rga_tox_docking_sas_updated.csv'),
    (pocket_df, 'all_predictions_pocket_tox_docking.csv'),
    (lingo_df, '600_predictions_lingo_tox_docking.csv'),
):
    annotated_df.to_csv(csv_path, index=False)
