In [1]:
import pandas as pd

# Read the bioactivity table from disk, report its dimensions, and preview it.
csv_path = "../data/bioactml_data.csv"
df = pd.read_csv(csv_path)
print("loaded data:", df.shape)
df.head(5)

loaded data: (5169, 2)


Unnamed: 0,canonical_smiles,pIC50
0,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)C(C)(C)C,5.0
1,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)c1ccccc1,5.0
2,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCCc1cccnc1,5.0
3,C=CC[C@@H]1/C=C(\C)C[C@H](C)C[C@H](OC)[C@H]2O[...,8.500038
4,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCS(=O)(=O)c1...,5.0


In [7]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_morgan(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Morgan radius forwarded to RDKit.
    n_bits : int, default 1024
        Length of the bit vector, forwarded to RDKit as ``nBits``.

    Returns
    -------
    list of int or None
        The fingerprint as a list of 0/1 values, or ``None`` when ``smiles``
        is not a non-blank string or RDKit cannot parse it.
    """
    # Missing values in a DataFrame column arrive as float NaN or None, and a
    # blank string describes no molecule; reject both up front so the caller's
    # dropna() can discard the row instead of handing RDKit a non-SMILES value.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals an unparseable string by returning None.
    if mol is None:
        return None

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    # Materialise the bit vector as a plain Python list so it can be stored in
    # a DataFrame cell and later stacked into a NumPy array.
    return list(fp)

# Attach a fingerprint to every compound, then discard the rows for which
# smiles_to_morgan returned None (no usable fingerprint).
df = (
    df.assign(features=df["canonical_smiles"].apply(smiles_to_morgan))
      .dropna(subset=["features"])
)

print("Featurised compounds:", len(df))
df.head()



Featurised compounds: 5169




Unnamed: 0,canonical_smiles,pIC50,features
0,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)C(C)(C)C,5.0,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)c1ccccc1,5.0,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCCc1cccnc1,5.0,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,C=CC[C@@H]1/C=C(\C)C[C@H](C)C[C@H](OC)[C@H]2O[...,8.500038,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCS(=O)(=O)c1...,5.0,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [8]:
import numpy as np

# Stack the per-compound bit lists into a 2-D feature matrix and pull the
# pIC50 column out as the target vector.
X = np.asarray(list(df["features"]))
y = df["pIC50"].to_numpy()

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (5169, 1024)
y shape: (5169,)


In [9]:
import pickle

# Serialise the feature matrix and the target vector to their own pickle files.
for path, array in (("../data/X.pkl", X), ("../data/y.pkl", y)):
    with open(path, "wb") as handle:
        pickle.dump(array, handle)

print("saved X.pkl and y.pkl to ../data/")

saved X.pkl and y.pkl to ../data/
