## Obtención de descriptores 
(Javier y Antonio)

Ya que no se aclaran los descriptores que se deben obtener se usa la biblioteca de RDKit y mordred para obtener descriptores 1D y 2D de los compuestos.

In [28]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, MolSurf, Fragments
import warnings

# Importar correctamente desde mordred
from mordred import Calculator, descriptors

warnings.filterwarnings("ignore")

In [29]:
# Crear calculadora solo con esos descriptores (1D/2D)
calc_mordred = Calculator(descriptors, ignore_3D=True)

In [30]:
# Lista de fragmentos de RDKit (no están en Mordred)
FRAGMENT_FUNCTIONS = [
    name for name in dir(Fragments)
    if name.startswith('fr_') and callable(getattr(Fragments, name))
]

In [31]:
def calcular_descriptores_1d_2d(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    descriptores = {}

    # --------------------------------------------------
    # Mordred: TODOS los descriptores 1D/2D
    # --------------------------------------------------
    try:
        mordred_result = calc_mordred(mol)
        for key, val in mordred_result.items():
            key_str = str(key)
            if isinstance(val, (int, float)) and not (isinstance(val, float) and np.isnan(val)):
                descriptores[key_str] = float(val)
            else:
                descriptores[key_str] = np.nan
    except Exception:
        # Si falla, llenar con NaN todos los nombres de Mordred
        for d in calc_mordred.descriptors:
            descriptores[str(d)] = np.nan

    # --------------------------------------------------
    # RDKi
    # --------------------------------------------------

    # 1. LabuteASA (no está en Mordred)
    descriptores["LabuteASA"] = MolSurf.LabuteASA(mol)

    # 2. Fragmentos funcionales (muy útiles, no en Mordred)
    for name in FRAGMENT_FUNCTIONS:
        func = getattr(Fragments, name)
        descriptores[name] = func(mol)

    # 3. Conteos personalizados (Mordred no da desglose por elemento)
    atomic_nums = [a.GetAtomicNum() for a in mol.GetAtoms()]
    descriptores["NumAtomosCarbono"] = atomic_nums.count(6)
    descriptores["NumAtomosOxigeno"] = atomic_nums.count(8)
    descriptores["NumAtomosNitrogeno"] = atomic_nums.count(7)
    descriptores["NumAtomosAzufre"] = atomic_nums.count(16)
    descriptores["NumAtomosHidrogeno"] = atomic_nums.count(1)
    descriptores["NumAtomosHalogenos"] = sum(1 for n in atomic_nums if n in [9, 17, 35, 53])

    # 4. Fracciones atómicas (no están directamente en Mordred)
    total = mol.GetNumAtoms()
    if total > 0:
        descriptores["FraccionC"] = descriptores["NumAtomosCarbono"] / total
        descriptores["FraccionO"] = descriptores["NumAtomosOxigeno"] / total
        descriptores["FraccionN"] = descriptores["NumAtomosNitrogeno"] / total
        descriptores["FraccionS"] = descriptores["NumAtomosAzufre"] / total
        descriptores["FraccionH"] = descriptores["NumAtomosHidrogeno"] / total
        descriptores["FraccionHalogenos"] = descriptores["NumAtomosHalogenos"] / total
    else:
        for k in ["FraccionC", "FraccionO", "FraccionN", "FraccionS", "FraccionH", "FraccionHalogenos"]:
            descriptores[k] = 0.0

    return descriptores

In [32]:
# Procesar el dataset
df = pd.read_csv('db/smilesdf.csv')

print(f"Procesando {len(df)} moléculas...")

# Calcular descriptores para cada molécula
descriptores_lista = []
smiles_validos = []
ids_validos = []
ic50_means = []
ic50_stds = []

for idx, row in df.iterrows():
    smiles = row['SMILES']
    descriptores = calcular_descriptores_1d_2d(smiles)
    
    if descriptores is not None:
        descriptores_lista.append(descriptores)
        smiles_validos.append(smiles)
        ids_validos.append(row['id'])
        ic50_means.append(row['IC50_mean'])
        ic50_stds.append(row['IC50_std'])
    else:
        print(f"SMILES inválido: {smiles}")

# Crear DataFrame con los descriptores
if descriptores_lista:
    df_descriptores = pd.DataFrame(descriptores_lista)
    df_descriptores.insert(0, 'SMILES', smiles_validos)
    df_descriptores.insert(0, 'id', ids_validos)
    df_descriptores['IC50_mean'] = ic50_means
    df_descriptores['IC50_std'] = ic50_stds
    
    df_descriptores = df_descriptores.dropna(axis=1)
    # Guardar resultados
    archivo_salida = 'db/smile_descriptor.csv'
    df_descriptores.to_csv(archivo_salida, index=False)
else:
    print("❌ No se pudieron calcular descriptores")

Procesando 40 moléculas...


In [33]:
df_descriptores.head()

Unnamed: 0,id,SMILES,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,NumAtomosHidrogeno,NumAtomosHalogenos,FraccionC,FraccionO,FraccionN,FraccionS,FraccionH,FraccionHalogenos,IC50_mean,IC50_std
0,3,c1c(oc(c1)c1sc(nn1)NC)[N+](=O)[O-],0.0,0.0,18.963222,2.385509,4.540223,18.963222,1.264215,3.637632,...,0,0,0.466667,0.2,0.266667,0.066667,0.0,0.0,54.0,0.17
1,4,c1c(oc(c1)c1sc(nn1)NCC)[N+](=O)[O-],0.0,0.0,20.133821,2.387226,4.543521,20.133821,1.258364,3.695893,...,0,0,0.5,0.1875,0.25,0.0625,0.0,0.0,50.0,0.8
2,5,c1c(oc(c1)c1sc(nn1)N1CC1)[N+](=O)[O-],0.0,0.0,20.985121,2.438105,4.595072,20.985121,1.31157,3.770283,...,0,0,0.5,0.1875,0.25,0.0625,0.0,0.0,55.5,0.66
3,6,c1c(oc(c1)c1sc(nn1)NCCO)[N+](=O)[O-],0.0,0.0,21.483789,2.38773,4.544779,21.483789,1.263752,3.750941,...,0,0,0.470588,0.235294,0.235294,0.058824,0.0,0.0,21.0,0.65
4,7,c1c(oc(c1)c1sc(nn1)NCCCO)[N+](=O)[O-],0.0,0.0,22.691729,2.387877,4.545276,22.691729,1.260652,3.803116,...,0,0,0.5,0.222222,0.222222,0.055556,0.0,0.0,18.0,0.2
