In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
orig_training_data = pd.read_csv("training_smiles.csv")

In [3]:
orig_training_data.shape

(202895, 3)

In [4]:
orig_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202895 entries, 0 to 202894
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   INDEX   202895 non-null  int64  
 1   SMILES  202895 non-null  object 
 2   ACTIVE  202895 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.6+ MB


# OK, Let's try to see what features we can get out of the dataset

In [5]:
features_df = orig_training_data.copy()

In [6]:
from rdkit import Chem


features_df["SMILES"] = features_df["SMILES"].astype("string")
features_df["MolFromSmiles"] = features_df["SMILES"].apply(Chem.MolFromSmiles)



In [7]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202895 entries, 0 to 202894
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   INDEX          202895 non-null  int64  
 1   SMILES         202895 non-null  string 
 2   ACTIVE         202895 non-null  float64
 3   MolFromSmiles  202895 non-null  object 
dtypes: float64(1), int64(1), object(1), string(1)
memory usage: 6.2+ MB


In [8]:
features_df[:5]

Unnamed: 0,INDEX,SMILES,ACTIVE,MolFromSmiles
0,1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b220>
1,2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc38b30>
2,3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b0d0>
3,4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b140>
4,5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b1b0>


## General features from MolFromSmiles

In [None]:
features_df["NumAtoms"] = features_df["MolFromSmiles"].apply(lambda m: m.GetNumAtoms() if m is not None else None)
# same func found in lipnski
# features_df["NumHeavyAtoms"] = features_df["MolFromSmiles"].apply(lambda m: m.GetNumHeavyAtoms() if m is not None else None)
features_df["NumBonds"] = features_df["MolFromSmiles"].apply(lambda m: m.GetNumBonds() if m is not None else None)
features_df.head()

Unnamed: 0,INDEX,SMILES,ACTIVE,MolFromSmiles,NumAtoms,NumHeavyAtoms,NumBonds
0,1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b220>,25,25,28
1,2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc38b30>,35,35,39
2,3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b0d0>,20,20,21
3,4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b140>,36,36,39
4,5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b1b0>,30,30,32


In [10]:
# only 6... not significant?
len(features_df.query('NumAtoms != NumHeavyAtoms'))

6

# Fragments lib features

In [11]:
from rdkit.Chem import Fragments as f

frag_methods = [m for m in dir(f) if m.startswith("fr_")]

for frag in frag_methods:
    func = getattr(f, frag)
    features_df[frag] = features_df["MolFromSmiles"].apply(
        lambda m, fn=func: fn(m) if m is not None else None
    )


# lipinski lib features

In [None]:
from rdkit.Chem import Lipinski as l
test_mol = next(m for m in features_df["MolFromSmiles"] if m is not None)

numeric_lipinski = []
for name in dir(l):
    attr = getattr(l, name)
    if callable(attr):
        try:
            result = attr(test_mol)
            if isinstance(result, (int, float)):
                numeric_lipinski.append(name)
        except:
            pass 
                

print("Numeric Lipinski functions:", numeric_lipinski)

for func_name in numeric_lipinski:
    fn = getattr(l, func_name)
    features_df[func_name] = features_df["MolFromSmiles"].apply(
        lambda m, fn=fn: fn(m) if m is not None else None
    )


Numeric Lipinski functions: ['FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAmideBonds', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumAtomStereoCenters', 'NumBridgeheadAtoms', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumHeterocycles', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumSpiroAtoms', 'NumUnspecifiedAtomStereoCenters', 'Phi', 'RingCount', '_cfn', '_fn']


# RDMolDescriptions lib features

In [18]:
import rdkit.Chem.rdMolDescriptors as d

test_mol = next(m for m in features_df["MolFromSmiles"] if m is not None)

numeric_rdmol= []
for name in dir(d):
    attr = getattr(d, name)
    if callable(attr):
        try:
            result = attr(test_mol)
            if isinstance(result, (int, float)):
                numeric_rdmol.append(name)
        except:
            pass 
                

print("Numeric RDMolDesc functions:", numeric_rdmol)

for func_name in numeric_rdmol:
    fn = getattr(d, func_name)
    features_df[func_name] = features_df["MolFromSmiles"].apply(
        lambda m, fn=fn: fn(m) if m is not None else None
    )



[10:27:23] 

****
Pre-condition Violation
molecule has no conformers
Violation occurred on line 234 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-11.0-arm64-cpython-312/rdkit/Code/GraphMol/Descriptors/AUTOCORR3D.cpp
Failed Expression: mol.getNumConformers() >= 1
****

[10:27:23] 

****
Pre-condition Violation
molecule has no conformers
Violation occurred on line 208 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-11.0-arm64-cpython-312/rdkit/Code/GraphMol/Descriptors/PMI.cpp
Failed Expression: mol.getNumConformers() >= 1
****

[10:27:23] 

****
Pre-condition Violation
molecule has no conformers
Violation occurred on line 40 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-11.0-arm64-cpython-312/rdkit/Code/GraphMol/Descriptors/CoulombMat.cpp
Failed Expression: mol.getNumConformers() >= 1
****

[10:27:23] 

****
Pre-condition Violation
molecule has no conformers
Violation occurred on line 235 in file /Users/runner/work/rdkit-pypi/

Numeric RDMolDesc functions: ['CalcChi0n', 'CalcChi0v', 'CalcChi1n', 'CalcChi1v', 'CalcChi2n', 'CalcChi2v', 'CalcChi3n', 'CalcChi3v', 'CalcChi4n', 'CalcChi4v', 'CalcExactMolWt', 'CalcFractionCSP3', 'CalcHallKierAlpha', 'CalcKappa1', 'CalcKappa2', 'CalcKappa3', 'CalcLabuteASA', 'CalcNumAliphaticCarbocycles', 'CalcNumAliphaticHeterocycles', 'CalcNumAliphaticRings', 'CalcNumAmideBonds', 'CalcNumAromaticCarbocycles', 'CalcNumAromaticHeterocycles', 'CalcNumAromaticRings', 'CalcNumAtomStereoCenters', 'CalcNumAtoms', 'CalcNumBridgeheadAtoms', 'CalcNumHBA', 'CalcNumHBD', 'CalcNumHeavyAtoms', 'CalcNumHeteroatoms', 'CalcNumHeterocycles', 'CalcNumLipinskiHBA', 'CalcNumLipinskiHBD', 'CalcNumRings', 'CalcNumRotatableBonds', 'CalcNumSaturatedCarbocycles', 'CalcNumSaturatedHeterocycles', 'CalcNumSaturatedRings', 'CalcNumSpiroAtoms', 'CalcNumUnspecifiedAtomStereoCenters', 'CalcPhi', 'CalcTPSA', '_CalcMolWt']


In [19]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202895 entries, 0 to 202894
Columns: 163 entries, INDEX to _CalcMolWt
dtypes: float64(25), int64(136), object(1), string(1)
memory usage: 252.3+ MB


In [21]:
features_df.to_csv("training_features.csv")

In [20]:
features_df.head()

Unnamed: 0,INDEX,SMILES,ACTIVE,MolFromSmiles,NumAtoms,NumHeavyAtoms,NumBonds,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,...,CalcNumRings,CalcNumRotatableBonds,CalcNumSaturatedCarbocycles,CalcNumSaturatedHeterocycles,CalcNumSaturatedRings,CalcNumSpiroAtoms,CalcNumUnspecifiedAtomStereoCenters,CalcPhi,CalcTPSA,_CalcMolWt
0,1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b220>,25,25,28,0,0,0,...,4,3,0,1,1,0,0,4.368063,76.58,340.383
1,2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc38b30>,35,35,39,0,0,0,...,5,8,1,2,3,1,5,6.877647,96.97,502.011
2,3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b0d0>,20,20,21,0,0,0,...,2,4,0,0,0,0,0,4.977937,54.88,307.806
3,4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b140>,36,36,39,0,0,0,...,4,8,0,0,0,0,0,7.516779,100.96,523.014
4,5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0,<rdkit.Chem.rdchem.Mol object at 0x12dc3b1b0>,30,30,32,0,0,0,...,3,6,0,0,0,0,0,6.202841,80.32,441.406
