In [25]:
pip install -q chembl_webresource_client pandas numpy scikit-learn torch rdkit-pypi

In [27]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Search the ChEMBL target index for 'EGFR' and preview the first hits.
target_query_df = pd.DataFrame.from_dict(new_client.target.search('EGFR'))
target_query_df.head()

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Mus musculus,Epidermal growth factor receptor erbB1,16.0,False,CHEMBL3608,"[{'accession': 'Q01279', 'component_descriptio...",SINGLE PROTEIN,10090
1,[],Homo sapiens,EGFR/PPP1CA,16.0,False,CHEMBL4523747,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
2,[],Homo sapiens,VHL/EGFR,16.0,False,CHEMBL4523998,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,CCN2-EGFR,16.0,False,CHEMBL5465557,"[{'accession': 'P00533', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
4,[],Homo sapiens,Epidermal growth factor receptor erbB1,12.0,False,CHEMBL203,"[{'accession': 'P00533', 'component_descriptio...",SINGLE PROTEIN,9606


In [28]:
# Use target CHEMBL203 and keep only the activity records reported as IC50.
egfr_activities = new_client.activity.filter(target_chembl_id='CHEMBL203')
res = egfr_activities.filter(standard_type="IC50")

# Materialise the query results as a DataFrame.
df = pd.DataFrame.from_dict(res)

In [29]:
# Retain rows that have a SMILES string, a reported value, and nM units,
# then narrow the frame to the two columns used afterwards.
has_smiles = df['canonical_smiles'].notna()
has_value = df['standard_value'].notna()
in_nanomolar = df['standard_units'] == 'nM'

df = df[has_smiles & has_value & in_nanomolar]
df = df[['canonical_smiles', 'standard_value']]

In [30]:
# Convert IC50 (nM) to pIC50 = -log10(IC50 expressed in mol/L)
import numpy as np

# Non-numeric entries become NaN; the `> 0` mask below is False for NaN,
# so those rows are dropped together with zero/negative values.
ic50_nm = pd.to_numeric(df['standard_value'], errors='coerce')
is_valid = ic50_nm > 0

# Build a fresh frame rather than assigning columns on a filtered slice,
# which avoids SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = pd.DataFrame({
    'canonical_smiles': df.loc[is_valid, 'canonical_smiles'],
    'pIC50': -np.log10(ic50_nm[is_valid] * 1e-9),
})
df.head()

Unnamed: 0,canonical_smiles,pIC50
0,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,7.387216
1,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,6.522879
2,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,5.106793
3,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,6.769551
4,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,7.39794


In [31]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

In [32]:
def mol_to_fp(smiles, radius=2, nBits=2048):
    """Convert a SMILES string into a Morgan fingerprint array.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        radius: Radius argument forwarded to the Morgan fingerprint call.
        nBits: Bit-vector length forwarded to the Morgan fingerprint call.

    Returns:
        ``np.array`` built from the fingerprint bit vector, or ``None`` when
        ``Chem.MolFromSmiles`` gives back a falsy result.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits)
    return np.array(bit_vector)

# Featurise every molecule; rows whose SMILES cannot be converted are skipped
# so that fingerprints and labels stay aligned.
fps, labels = [], []

for smiles, pic50 in zip(df['canonical_smiles'], df['pIC50']):
    fp = mol_to_fp(smiles)
    if fp is None:
        continue
    fps.append(fp)
    labels.append(pic50)

# Stack into the feature matrix and target vector.
X = np.array(fps)
y = np.array(labels)

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features using statistics fitted on the training portion only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

# Feature matrices become float32 tensors as-is.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)

# Targets additionally get a trailing axis of size 1 via unsqueeze(1).
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.float32).unsqueeze(1)
    for targets in (y_train, y_test)
)

# Define model
class IC50NN(nn.Module):
    """Fully connected regressor: input_dim -> 256 -> 128 -> 1, ReLU after each hidden layer."""

    def __init__(self, input_dim):
        """Create the layers.

        Args:
            input_dim: Number of input features accepted by the first linear layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.out = nn.Linear(128, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both hidden layers (each followed by ReLU) and the output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.out(hidden)

# Seed PyTorch before constructing the network so the random weight
# initialisation is repeatable from one run to the next.
torch.manual_seed(42)
model = IC50NN(input_dim=X_train.shape[1])

In [35]:
# Mean-squared-error objective optimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Each epoch performs one optimiser step on the whole training tensor
epochs = 50
for epoch in range(epochs):
    model.train()

    # Forward pass
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the loss only on every 10th epoch
    if (epoch + 1) % 10:
        continue
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 10/50, Loss: 8.3220
Epoch 20/50, Loss: 2.6975
Epoch 30/50, Loss: 1.8963
Epoch 40/50, Loss: 0.9632
Epoch 50/50, Loss: 0.6481


In [36]:
from sklearn.metrics import r2_score

# Score the held-out set in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    # squeeze(1) removes only the size-1 output axis, so a test set with a
    # single sample stays 1-D instead of collapsing to a 0-d scalar.
    preds = model(X_test_tensor).squeeze(1)
    targets = y_test_tensor.squeeze(1)

mse = ((preds - targets) ** 2).mean().item()
print(f"Test MSE: {mse:.4f}")

r2 = r2_score(targets.numpy(), preds.numpy())
print(f"R² score: {r2:.4f}")

Test MSE: 1.1094
R² score: 0.4935
