<a href="https://colab.research.google.com/github/rromerov/Proyecto_Integrador/blob/main/utils/molGeneratorEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clonar modelo de Hugging Face
! git clone https://huggingface.co/mrm8488/chEMBL_smiles_v1

Cloning into 'chEMBL_smiles_v1'...
remote: Enumerating objects: 49, done.[K
remote: Total 49 (delta 0), reused 0 (delta 0), pack-reused 49 (from 1)[K
Unpacking objects: 100% (49/49), 19.21 KiB | 1.13 MiB/s, done.
Filtering content: 100% (4/4), 275.08 MiB | 33.08 MiB/s, done.


In [18]:
# Instalar librerias
%%capture
! pip install datasets
! pip install transformers[torch]
! pip install accelerate -U
! pip install rdkit
! pip install rdkit-pypi

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, RobertaForCausalLM, pipeline
from datasets import Dataset
import numpy as np
import torch
from rdkit import Chem
from google.colab import drive
from rdkit.Chem import rdchem
from random import sample
import re
import sys
import os

In [19]:
# Cargar Google Drive al notebook
drive.mount('/content/drive')

# Cargar el dataset
ruta_archivo_bioactivity = '/content/drive/My Drive/Colab Notebooks/data/bioactivity_data_2class_pIC50.csv'

# Lee el archivo CSV en un DataFrame
df = pd.read_csv(ruta_archivo_bioactivity)

# Mostrar que se haya importado correctamente el archivo
df.head(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL2022564,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,828.13,5.1124,2.0,10.0,9.49485
1,CHEMBL3234200,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,814.103,4.8664,2.0,10.0,9.522879
2,CHEMBL3234201,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,842.157,5.5025,2.0,10.0,9.327902
3,CHEMBL3234202,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,828.13,5.1124,2.0,10.0,9.920819
4,CHEMBL3649592,CC1CCC(NCc2ccn(Cc3ccccc3)c(=O)c2O)CC1,inactive,326.44,3.2705,2.0,4.0,4.562249


In [20]:
# Usar solo smiles con clase activa
df = df.loc[df['class'] == 'active']

In [21]:
# Mostrar dataframe
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL2022564,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,828.130,5.11240,2.0,10.0,9.494850
1,CHEMBL3234200,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,814.103,4.86640,2.0,10.0,9.522879
2,CHEMBL3234201,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,842.157,5.50250,2.0,10.0,9.327902
3,CHEMBL3234202,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,active,828.130,5.11240,2.0,10.0,9.920819
21,CHEMBL253969,NC(=O)c1c(OCc2c(F)cc(Br)cc2F)nsc1NC(=O)NCCCCN1...,active,532.411,3.85920,3.0,6.0,8.278189
...,...,...,...,...,...,...,...,...
10594,CHEMBL3335371,CC(=O)Nc1cc(Oc2ccc3c(c2)nc(Nc2cccc(C(C)(C)C)c2...,active,429.524,5.76010,2.0,6.0,8.698970
10595,CHEMBL470808,CNC(=O)c1cc(Oc2ccc3c(c2)nc(Nc2cccc(C(C)(C)C)c2...,active,429.524,5.16130,2.0,6.0,8.301030
10596,CHEMBL4282506,N#CCC(=O)N/N=C1\C(=O)Nc2ccc(S(=O)(=O)N3CCOCC3)...,active,377.382,-0.60642,2.0,7.0,6.559091
10631,CHEMBL4644274,COc1ccc(-c2cnn3c(N)c(-c4ccccc4)c(NC4CC(C)(C)N(...,active,514.674,5.72600,2.0,8.0,6.004365




---

#### Carga del modelo pre-entrenado con finetuning



In [22]:
# Verificar si CUDA está disponible y definir el dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device in use: {device}')

Device in use: cpu


In [23]:
# Cargar el modelo y el tokenizador entrenados
model_name = "/content/drive/My Drive/Colab Notebooks/data/results"  # Directorio donde se guardaron los resultados del entrenamiento
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForCausalLM.from_pretrained(model_name, is_decoder=True)  # Asegúrate de configurar el modelo como decodificador

# Crear el pipeline de fill-mask usando el modelo afinado
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)



---

#### Funciones para generación de smiles

In [24]:
# Función para validar SMILES
def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

In [25]:
# Función para remover átomos y agregar *
def remove_atoms_and_placeholder(smiles, num_atoms_to_remove=1):
    # Redirigir stderr para silenciar errores de RDKit
    stderr = sys.stderr
    sys.stderr = open(os.devnull, 'w')

    try:
        # Convertir SMILES a una molécula RDKit
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None, None

        # Seleccionar átomos al azar para eliminar
        atoms_to_remove = sample(range(mol.GetNumAtoms()), num_atoms_to_remove)

        # Crear una copia editable de la molécula
        mol_edit = rdchem.RWMol(mol)

        # Remover átomos
        for idx in sorted(atoms_to_remove, reverse=True):
            mol_edit.ReplaceAtom(idx, rdchem.Atom('*'))

        # Convertir la molécula modificada de nuevo a SMILES
        modified_smiles = Chem.MolToSmiles(mol_edit)

    finally:
        # Restaurar stderr
        sys.stderr.close()
        sys.stderr = stderr

    return modified_smiles, atoms_to_remove

In [43]:
%%capture
# Ejemplo de múltiples secuencias SMILES con placeholders en diferentes posiciones
smiles_list = df['canonical_smiles']

# DataFrame para almacenar los resultados
results_list = []

# Generar secuencias completas para cada SMILES en la lista
for idx, smiles in enumerate(smiles_list):
    modified_smiles, removed_atoms = remove_atoms_and_placeholder(smiles, num_atoms_to_remove=1)
    if modified_smiles:
            modified_smiles_with_mask = modified_smiles.replace('*', '<mask>')

            # Usar el pipeline para llenar las máscaras
            results = fill_mask(modified_smiles_with_mask)

            # Procesar los resultados y almacenar en la lista de resultados
            for res in results:
                sequence = res['sequence']
                sequence = sequence.replace('<s>', '').replace('</s>', '').strip()
                if is_valid_smiles(sequence):
                    results_list.append({
                        'Original_SMILES': smiles,
                        'Modified_SMILES': sequence,
                        'Group': idx
                    })

# Crear el DataFrame a partir de la lista de resultados
results_df = pd.DataFrame(results_list)

# Filtrar filas donde el Original_SMILES sea igual al Modified_SMILES
results_df = results_df[results_df['Original_SMILES'] != results_df['Modified_SMILES']]

In [89]:
# Mostrar el DataFrame resultante
results_df = results_df.reset_index(drop=True)
results_df

Unnamed: 0,Original_SMILES,Modified_SMILES,Group
0,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0
1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0
2,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1
3,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1
4,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,2
...,...,...,...
10784,CNC(=O)c1cc(Oc2ccc3c(c2)nc(Nc2cccc(C(C)(C)C)c2...,CNC(=O)c1c:ccc(Oc2ccc3c(c2)nc(Nc2cccc(C(C)(C)C...,6718
10785,N#CCC(=O)N/N=C1\C(=O)Nc2ccc(S(=O)(=O)N3CCOCC3)...,CS=S(=O)(c1ccc2c(c1)/C(=N/NC(=O)CC#N)C(=O)N2)N...,6719
10786,COc1ccc(-c2cnn3c(N)c(-c4ccccc4)c(NC4CC(C)(C)N(...,COc1ccc(-c2:c:nn3c(N)c(-c4ccccc4)c(NC4CC(C)(C)...,6720
10787,COc1ccc(-c2cnn3c(N)c(-c4ccccc4)c(NC4CC(C)(C)N(...,COc1ccc(-c2:n:nn3c(N)c(-c4ccccc4)c(NC4CC(C)(C)...,6720


In [90]:
# Seleccionar solo la columna 'molecule_chembl_id' del DataFrame original
df_subset = df[['molecule_chembl_id']]

# Añadir la columna 'molecule_chembl_id' al DataFrame results_df
merged_df = pd.merge(results_df, df_subset, left_on='Group', right_index=True)

In [91]:
# Mostrar el DataFrame resultante
merged_df = merged_df.reset_index(drop=True)
merged_df

Unnamed: 0,Original_SMILES,Modified_SMILES,Group,molecule_chembl_id
0,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0,CHEMBL2022564
1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0,CHEMBL2022564
2,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1,CHEMBL3234200
3,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1,CHEMBL3234200
4,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,2,CHEMBL3234201
...,...,...,...,...
6689,CC(=O)NC(C)c1ccc(Nc2ncc(C(F)(F)F)c(CCc3ccccc3C...,CSC(=O)C1(c2ccccc2CCc2nc(Nc3ccc(C(C)NC(C)=O)cc...,6655,CHEMBL3917723
6690,COCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)n...,COCOCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)...,6657,CHEMBL3947740
6691,COCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)n...,COOCCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3...,6657,CHEMBL3947740
6692,Cc1cc(Cl)ccc1C(=O)NS(=O)(=O)c1ccc(Cl)cc1,Cc1cc(Cl)ccc1C(=O)NS(=O)(=O)c1:c:cc(Cl)cc1,6712,CHEMBL3918092


In [92]:
# Remover moléculas modificadas que sean iguales a las originales
merged_df = merged_df[merged_df['Original_SMILES'] != merged_df['Modified_SMILES']]
merged_df

Unnamed: 0,Original_SMILES,Modified_SMILES,Group,molecule_chembl_id
0,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0,CHEMBL2022564
1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0,CHEMBL2022564
2,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1,CHEMBL3234200
3,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1,CHEMBL3234200
4,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,2,CHEMBL3234201
...,...,...,...,...
6689,CC(=O)NC(C)c1ccc(Nc2ncc(C(F)(F)F)c(CCc3ccccc3C...,CSC(=O)C1(c2ccccc2CCc2nc(Nc3ccc(C(C)NC(C)=O)cc...,6655,CHEMBL3917723
6690,COCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)n...,COCOCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)...,6657,CHEMBL3947740
6691,COCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)n...,COOCCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3...,6657,CHEMBL3947740
6692,Cc1cc(Cl)ccc1C(=O)NS(=O)(=O)c1ccc(Cl)cc1,Cc1cc(Cl)ccc1C(=O)NS(=O)(=O)c1:c:cc(Cl)cc1,6712,CHEMBL3918092


In [93]:
# Contar el número de moléculas válidas para cada molécula original
count_valid_molecules = merged_df.groupby('Group').size().reset_index(name='Count_Valid_Molecules')

# Eliminar la columna 'Modified_SMILES' del DataFrame merged_df
valid_mol_per_original_mol = merged_df.drop(columns=['Modified_SMILES'])

# Fusionar el conteo de moléculas válidas con el DataFrame merged_df
valid_mol_per_original_mol = pd.merge(valid_mol_per_original_mol, count_valid_molecules, on='Group', how='left')

# Eliminar filas duplicadas basadas en 'Original_SMILES'
valid_mol_per_original_mol = valid_mol_per_original_mol.drop_duplicates(subset=['Original_SMILES'])

# Mostrar el DataFrame resultante
valid_mol_per_original_mol = valid_mol_per_original_mol.reset_index(drop=True)
valid_mol_per_original_mol

Unnamed: 0,Original_SMILES,Group,molecule_chembl_id,Count_Valid_Molecules
0,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,0,CHEMBL2022564,2
1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,1,CHEMBL3234200,2
2,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,2,CHEMBL3234201,4
3,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)O[C@H](C(...,3,CHEMBL3234202,1
4,COc1cc2c(Nc3c(F)cc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,21,CHEMBL253969,1
...,...,...,...,...
3341,CN1CCC(c2ccc(Nc3ncc(C(F)(F)F)c(CCc4ccccc4C4(C(...,6652,CHEMBL3939588,1
3342,CC(N)c1ccc(Nc2ncc(C(F)(F)F)c(CCc3ccccc3C3(C(N)...,6654,CHEMBL3904787,3
3343,CC(=O)NC(C)c1ccc(Nc2ncc(C(F)(F)F)c(CCc3ccccc3C...,6655,CHEMBL3917723,2
3344,COCCn1cc(Nc2ncc(Cl)c(CCc3ccccc3C3(C(N)=O)CC3)n...,6657,CHEMBL3947740,2
