<a href="https://colab.research.google.com/github/rromerov/Proyecto_Integrador/blob/main/Avance2/Avance2.12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [2]:
import pandas as pd
from google.colab import drive
from rdkit import Chem
from rdkit.Chem import AllChem
import os

In [3]:
# Mount Google Drive so the project's data folder is reachable from the notebook
drive.mount('/content/drive')

# Load the bioactivity table (includes SMILES, ChEMBL IDs and pIC50) into `df`
df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/bioactivity_data_2class_pIC50.csv'
)

Mounted at /content/drive


In [4]:
# Preview the first five rows of the loaded table
df.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL212560,CN(C)CCCOc1cccc2c(C#N)c(-c3ccccc3)c(NC3CCCCC3)n12,inactive,416.569,5.55308,1.0,5.0,4.102373
1,CHEMBL386641,CN(C)CCNc1cccc2c(C#N)c(-c3ccccc3)c(NC3CCCCC3)n12,inactive,401.558,5.19608,2.0,5.0,4.376751
2,CHEMBL425440,N#Cc1c(-c2ccccc2)c(Nc2cccc(N)c2)n2c(Cl)cccc12,inactive,358.832,5.45718,2.0,4.0,4.09691
3,CHEMBL436932,CN(C)CCCNc1c(-c2ccccc2)c(C#N)c2cccc(Cl)n12,inactive,352.869,4.49498,1.0,4.0,4.207608
4,CHEMBL213321,Cc1cccc(C)c1Nc1c(-c2ccccc2)c(C#N)c2cccc(Cl)n12,inactive,371.871,6.49182,1.0,3.0,4.221849


In [5]:
# Columns for the .smi file: SMILES string first, then the molecule identifier
selection = ['canonical_smiles', 'molecule_chembl_id']
df_selection = df.loc[:, selection]

# Write as tab-separated text with neither header row nor index column
df_selection.to_csv(
    '/content/drive/My Drive/Colab Notebooks/data/molecule.smi',
    sep='\t',
    header=False,
    index=False,
)

In [6]:
! cat '/content/drive/My Drive/Colab Notebooks/data/molecule.smi' | head -5

CN(C)CCCOc1cccc2c(C#N)c(-c3ccccc3)c(NC3CCCCC3)n12	CHEMBL212560
CN(C)CCNc1cccc2c(C#N)c(-c3ccccc3)c(NC3CCCCC3)n12	CHEMBL386641
N#Cc1c(-c2ccccc2)c(Nc2cccc(N)c2)n2c(Cl)cccc12	CHEMBL425440
CN(C)CCCNc1c(-c2ccccc2)c(C#N)c2cccc(Cl)n12	CHEMBL436932
Cc1cccc(C)c1Nc1c(-c2ccccc2)c(C#N)c2cccc(Cl)n12	CHEMBL213321


In [7]:
! cat '/content/drive/My Drive/Colab Notebooks/data/molecule.smi' | wc -l

98


## Calcular descriptores

In [8]:
# Compute a fixed-length bit-vector fingerprint for a single molecule
def calculate_pubchem_fingerprints(smiles, radius=2, n_bits=881):
    """Return a Morgan bit-vector fingerprint for a SMILES string.

    Despite the function name, this computes an RDKit Morgan (circular)
    fingerprint folded to ``n_bits`` bits; it does not compute PubChem
    substructure keys. The 881-bit default is kept from the original code
    (presumably chosen to match the PubChem fingerprint length; confirm).

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan fingerprint radius (default 2).
    n_bits : int, optional
        Length of the folded bit vector (default 881).

    Returns
    -------
    ExplicitBitVect or None
        The fingerprint, or ``None`` when ``smiles`` is not a string (for
        example the NaN pandas yields for an empty field) or RDKit cannot
        parse it.
    """
    # A missing value read by pandas arrives as a float NaN, which
    # Chem.MolFromSmiles rejects with an exception; treat it as unparsable.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

# Read molecule.smi (tab-separated: SMILES, molecule ID) and fingerprint each molecule
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/molecule.smi", sep="\t", header=None, names=["SMILES", "ID"])

# Collect each ID together with its fingerprint. A molecule whose SMILES cannot
# be parsed is skipped in BOTH lists; inserting the unfiltered ID column
# afterwards would shift every following name onto the wrong fingerprint row.
fingerprints = []
fingerprint_ids = []
for mol_id, smiles in zip(data["ID"], data["SMILES"]):
    fp = calculate_pubchem_fingerprints(smiles)
    if fp is None:
        print(f"Skipping {mol_id}: could not parse SMILES {smiles!r}")
        continue
    # Expand the bit vector into a list of 0/1 integers
    fingerprints.append([int(bit) for bit in fp.ToBitString()])
    fingerprint_ids.append(mol_id)

# Build the descriptor table: one row per fingerprinted molecule, one column per bit
fingerprints_df = pd.DataFrame(fingerprints)
fingerprints_df.columns = [f"PubchemFP{i}" for i in range(len(fingerprints_df.columns))]
fingerprints_df.insert(0, "Name", fingerprint_ids)

# Save the fingerprint descriptors to a CSV file
fingerprints_df.to_csv('/content/drive/My Drive/Colab Notebooks/data/descriptors_output.csv', index=False)


def _canonical_smiles(smiles):
    """Round-trip one SMILES through RDKit; return None if it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return None
    return Chem.MolToSmiles(Chem.RemoveHs(mol))


# Rewrite the SMILES column by parsing each entry with RDKit, applying
# Chem.RemoveHs and serialising it back to SMILES. No explicit salt-removal
# step is applied here, and the rewritten column is not saved in this cell.
data["SMILES"] = data["SMILES"].apply(_canonical_smiles)

## Preparar Matrices de datos de X y Y

### Matriz de datos X

In [9]:
# Reload the fingerprint descriptors written earlier as the feature table
df2_X = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/descriptors_output.csv'
)

In [10]:
# Display the descriptor table (molecule name followed by the fingerprint bit columns)
df2_X

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL212560,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,CHEMBL386641,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,CHEMBL425440,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,CHEMBL436932,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
4,CHEMBL213321,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,CHEMBL4515173,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94,CHEMBL4646334,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,CHEMBL4643884,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,CHEMBL4637483,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Remove the identifier column so that only the fingerprint bits remain as features.
# errors='ignore' keeps the cell idempotent: re-running it after 'Name' has
# already been dropped is a no-op instead of a KeyError.
df2_X = df2_X.drop(columns=['Name'], errors='ignore')
df2_X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Variable Y

### Seleccionar la columna pIC50 (ya calculada a partir de IC50)

In [12]:
# Target variable: the pIC50 column of the bioactivity table
df2_Y = df.loc[:, 'pIC50']
df2_Y

0     4.102373
1     4.376751
2     4.096910
3     4.207608
4     4.221849
        ...   
93    4.300162
94    4.386158
95    4.718967
96    4.379864
97    4.468521
Name: pIC50, Length: 98, dtype: float64

In [13]:
# Join the fingerprint matrix (X) and the pIC50 target (Y) column-wise.
# pd.concat(axis=1) aligns rows on the index, so if the two indexes differ
# (for example because a molecule was skipped while fingerprinting) the result
# would silently contain NaN rows or features paired with the wrong target.
# Fail loudly instead of writing a corrupted dataset.
if not df2_X.index.equals(df2_Y.index):
    raise ValueError(
        f"Feature/target row mismatch: X has {len(df2_X)} rows, Y has {len(df2_Y)} rows"
    )

dataset = pd.concat([df2_X, df2_Y], axis=1)
dataset

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4.102373
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,4.376751
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,4.096910
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,4.207608
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,4.221849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.300162
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.386158
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.718967
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.379864


Guardar el dataset para posteriormente realizar el modelo de aprendizaje supervisado

In [14]:
# Persist the combined feature/target table for the modelling notebook.
# NOTE(review): the file name says "3class" while the input file was "2class"; confirm the intended name.
dataset.to_csv(
    '/content/drive/My Drive/Colab Notebooks/data/bioactivity_data_3class_pIC50_pubchem_fp.csv',
    index=False,
)

Guardar en un archivo .zip los archivos generados

In [15]:
# Guardar archivos en un archivo zip
! zip -r /content/drive/My\ Drive/Colab\ Notebooks/data/results.zip /content/drive/My\ Drive/Colab\ Notebooks/data


updating: content/drive/My Drive/Colab Notebooks/data/plot_bioactivity_class.pdf (deflated 37%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_ic50.pdf (deflated 38%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_LogP.pdf (deflated 38%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_MW.pdf (deflated 38%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_MW_vs_LogP.pdf (deflated 19%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_NumHAcceptors.pdf (deflated 37%)
updating: content/drive/My Drive/Colab Notebooks/data/plot_NumHDonors.pdf (deflated 37%)
updating: content/drive/My Drive/Colab Notebooks/data/bioactivity_data_2class_pIC50.csv (deflated 77%)
updating: content/drive/My Drive/Colab Notebooks/data/bioactivity_data.csv (deflated 90%)
updating: content/drive/My Drive/Colab Notebooks/data/bioactivity_data_curated.csv (deflated 83%)
updating: content/drive/My Drive/Colab Notebooks/data/bioactivity_data_preprocessed.csv (deflate

In [16]:
# List the data directory to check that the generated files are present
! ls '/content/drive/My Drive/Colab Notebooks/data/'

bioactivity_data_2class_pIC50.csv	      mannwhitneyu_pIC50.csv
bioactivity_data_3class_pIC50_pubchem_fp.csv  molecule.smi
bioactivity_data.csv			      plot_bioactivity_class.pdf
bioactivity_data_curated.csv		      plot_ic50.pdf
bioactivity_data_preprocessed.csv	      plot_LogP.pdf
descriptors_output.csv			      plot_MW.pdf
mannwhitneyu_LogP.csv			      plot_MW_vs_LogP.pdf
mannwhitneyu_MW.csv			      plot_NumHAcceptors.pdf
mannwhitneyu_NumHAcceptors.csv		      plot_NumHDonors.pdf
mannwhitneyu_NumHDonors.csv		      results.zip
