## Descriptor Calculation and Dataset Preparation

In this notebook, I calculate molecular descriptors, which are quantitative descriptions of the compounds in the dataset. I also combine them with the bioactivity values to prepare a dataset for subsequent model building.

- Goal: calculate molecular descriptors for each compound in the dataset.

## Load bioactivity data

In [11]:
import pandas as pd

# Read the 3-class bioactivity table (with its pIC50 column) that feeds the regression work
bioactivity_csv = 'bioactivity_data_3class_pIC50.csv'
df3 = pd.read_csv(bioactivity_csv)
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,intermediate,281.271,1.89262,0.0,5.0,5.142668
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,intermediate,415.589,3.81320,0.0,2.0,5.026872
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,inactive,421.190,2.66050,0.0,4.0,4.869666
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,inactive,293.347,3.63080,0.0,3.0,4.882397
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],intermediate,338.344,3.53900,0.0,5.0,5.698970
...,...,...,...,...,...,...,...,...
240,CHEMBL4590273,Cc1cccc2nc(CSC(=S)NCc3cccnc3)cn12,active,328.466,3.34562,1.0,5.0,6.419999
241,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],active,222.379,-1.99300,1.0,3.0,6.782516
242,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,active,485.559,0.54470,5.0,7.0,6.793174
243,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],active,222.379,-1.99300,1.0,3.0,6.779997


In [12]:

# Two-column, tab-separated .smi layout: SMILES string first, ChEMBL ID second.
# No header row and no index column are written.
selection = ['canonical_smiles', 'molecule_chembl_id']
df3_selection = df3.loc[:, selection]
df3_selection.to_csv('molecule.smi', sep='\t', header=False, index=False)

In [13]:

! cat molecule.smi | head -5

Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21	CHEMBL187579
O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21	CHEMBL188487
O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21	CHEMBL185698
O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21	CHEMBL426082
O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]	CHEMBL187717


In [14]:
! cat molecule.smi | wc -l

     245


## Calculate fingerprint descriptors

### Calculate with RDKit

Generate Morgan Fingerprints

In [17]:
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import numpy as np

# Morgan fingerprint settings: radius 2, folded into a 2048-bit vector
MORGAN_RADIUS = 2
MORGAN_N_BITS = 2048
gen = GetMorganGenerator(radius=MORGAN_RADIUS, fpSize=MORGAN_N_BITS)

fps = []      # one list of 0/1 ints per successfully parsed molecule
mols = []     # SMILES of the parsed molecules (same order as fps)
ids = []      # ChEMBL IDs of the parsed molecules (same order as fps)
skipped = []  # ChEMBL IDs whose SMILES was missing or could not be parsed

# Walk the two columns in parallel instead of iterrows(), which builds a Series per row
for smiles, chembl_id in zip(df3_selection['canonical_smiles'],
                             df3_selection['molecule_chembl_id']):
    # A missing SMILES is read from CSV as NaN (a float); MolFromSmiles only accepts strings
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Record the drop instead of discarding the row silently
        skipped.append(chembl_id)
        continue
    fp = gen.GetFingerprint(mol)
    fps.append(list(map(int, fp.ToBitString())))
    mols.append(smiles)
    ids.append(chembl_id)

# Only speak up when something was dropped, so a clean run stays quiet
if skipped:
    print(f"Skipped {len(skipped)} molecule(s) with missing or unparseable SMILES: {skipped}")

# Build dataframe of fingerprints: integer-named bit columns, preceded by SMILES and ID
fp_df = pd.DataFrame(fps)
fp_df.insert(0, "molecule_chembl_id", ids)
fp_df.insert(0, "canonical_smiles", mols)

Merge with bioactivity

In [21]:
# Attach the pIC50 values to the fingerprints.
#
# molecule_chembl_id is NOT unique in df3 (e.g. CHEMBL5436771 appears twice with
# different pIC50 values). Merging on the ID alone is therefore many-to-many: every
# fingerprint row of a repeated ID is paired with every activity row of that ID,
# which multiplies the duplicates (2 x 2 = 4 rows) and mixes up the measurements.
# Numbering the repeats of each ID on both sides and joining on (ID, occurrence)
# keeps exactly one pIC50 per fingerprint row.
# NOTE(review): this assumes rows sharing an ID also share a SMILES, so that either
# all or none of them were kept when the fingerprints were built -- confirm.
fp_keyed = fp_df.assign(
    _occurrence=fp_df.groupby('molecule_chembl_id').cumcount()
)
activity = df3[['molecule_chembl_id', 'pIC50']]
activity_keyed = activity.assign(
    _occurrence=activity.groupby('molecule_chembl_id').cumcount()
)

final_df = (
    fp_keyed
    .merge(
        activity_keyed,
        on=['molecule_chembl_id', '_occurrence'],
        how='inner',
        validate='one_to_one',  # fail loudly if the keys ever stop being unique
    )
    .drop(columns='_occurrence')
)

# Each fingerprint row must appear exactly once in the merged table
assert len(final_df) == len(fp_df), "merge changed the number of fingerprint rows"

# Check result
final_df.head()


                                  canonical_smiles molecule_chembl_id  0  1  \
0           Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21       CHEMBL187579  0  0   
1           O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21       CHEMBL188487  0  0   
2          O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21       CHEMBL185698  0  0   
3              O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21       CHEMBL426082  0  0   
4  O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]       CHEMBL187717  0  0   

   2  3  4  5  6  7  ...  2039  2040  2041  2042  2043  2044  2045  2046  \
0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0   
1  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0   
2  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0   
3  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0   
4  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0   

   2047     pIC50  
0     0  5.142668  
1     0  5.026872  
2     0 

In [22]:
# Write the fingerprint + pIC50 table to disk (no index column) for later model building
morgan_fp_csv = 'bioactivity_data_3class_pIC50_morgan_fp.csv'
final_df.to_csv(morgan_fp_csv, index=False)
