In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [2]:
# select cpu
# Override CUDA detection for the whole session so that every later
# torch.cuda.is_available() check reports False and the CPU is used.
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device: ", device)
# Only reached if the override above is removed and a GPU is present.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))

# version
print(f"Torch version: {torch.__version__}")

Using device:  cpu
Torch version: 1.12.1+cu102


In [2]:
# data
# Read the CSV at this notebook-relative path into a DataFrame.
df = pd.read_csv('../../data/her2_molecules.csv')
# Display the last two rows as a quick look at the loaded columns.
df.tail(2)

Unnamed: 0,name,smiles,IC50,units,pIC50,class
2573,CHEMBL433520,CCN(CC)CC(O)CNc1cc2c(Nc3cccc(Br)c3)ncnc2cn1,1100000.0,nM,2.958607,low activity
2574,CHEMBL477,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,1600000.0,nM,2.79588,low activity


In [4]:
# molecular fingerprints calc
from rdkit import Chem
from rdkit.Chem import AllChem

def fingerprints(smiles, radius=2, nbits=2048):
    """Compute Morgan bit-vector fingerprints for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 2
        Radius forwarded to the Morgan fingerprint routine.
    nbits : int, default 2048
        Length of each fingerprint bit vector.

    Returns
    -------
    list of numpy.ndarray
        One array per input molecule, in input order. An empty input
        yields an empty list.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.
    """
    fps = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # A failed parse comes back as None; stop here with the offending
            # entry named instead of failing later inside the fingerprint call.
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smi!r}"
            )
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nbits)
        fps.append(np.array(bit_vector))
    return fps

In [5]:
# Fingerprint every entry of the 'smiles' column using the default radius and bit length.
fprints = fingerprints(df['smiles'])

In [6]:
# feature encoding
from sklearn.preprocessing import LabelEncoder
# Learn the mapping from the 'class' strings to integer codes and apply it
# in one step; the fitted encoder is kept for later inverse lookups.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

In [7]:
# features
# Combine the per-molecule fingerprint arrays into a single array
# (later cells read X_train.shape[1] as the feature count).
X = np.array(fprints)
# Encoded class labels as a NumPy array.
y = np.array(y)

In [8]:
# data split
from sklearn.model_selection import train_test_split
# Seed the NumPy and PyTorch global generators before splitting.
np.random.seed(1)
torch.manual_seed(1)

# Hold out a quarter of the samples for testing; the split itself is
# controlled by random_state rather than the global seeds.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [9]:
# standardize features
# Both statistics are single scalars taken over every entry of X_train
# (no axis is given), and the same training statistics scale the test set.
train_mean = np.mean(X_train)
train_std = np.std(X_train)
X_train_norm = (X_train - train_mean) / train_std
X_test_norm = (X_test - train_mean) / train_std

In [25]:
# Create tensors from arrays
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run even though it rebinds the same names it reads from.
X_train_norm = torch.as_tensor(X_train_norm, dtype=torch.float32)
# Targets are class indices for nn.CrossEntropyLoss, which requires int64.
y_train = torch.as_tensor(y_train, dtype=torch.long)

X_test_norm = torch.as_tensor(X_test_norm, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.long)

In [26]:
# pytorch dataset and dataloader
from torch.utils.data import TensorDataset, DataLoader

# Pair each training feature row with its label.
train_ds = TensorDataset(X_train_norm, y_train)
# Re-seed before building the loader (presumably to keep shuffling repeatable).
torch.manual_seed(1)
batch_size = 10
# Mini-batches of batch_size samples, reshuffled on every pass.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

In [41]:
# model building
import torch.nn as nn

# Widths of the hidden layers, in order from input to output.
hidden_units = [100, 50]
# The first layer takes one input per feature column.
input_size = X_train.shape[1]
all_layers = []
for n_units in hidden_units:
    # Each hidden stage is a linear map followed by a ReLU.
    all_layers.extend([nn.Linear(input_size, n_units), nn.ReLU()])
    # The next stage consumes this stage's output width.
    input_size = n_units
# Final linear layer producing 3 outputs, with no activation after it.
all_layers.append(nn.Linear(hidden_units[-1], 3))

In [42]:
# Chain the prepared layers, in order, into a single module.
model = nn.Sequential(*all_layers)
# Bare expression so the notebook displays the layer structure.
model

Sequential(
  (0): Linear(in_features=2048, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=3, bias=True)
)

In [43]:
# loss function
# Cross-entropy with default settings, applied to the model's raw outputs.
loss_fn = nn.CrossEntropyLoss()
# optimization
# Adam over every parameter of the model with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [55]:
# model training
torch.manual_seed(1)
num_epochs = 50

for epoch in range(num_epochs):
    # Per-epoch running totals; both are reset at the start of every epoch,
    # so after the loop they hold the values of the final epoch only.
    accuracy_hist_train = 0
    loss_hist_train = 0
    for x_batch, y_batch in train_dl:
        # Clear gradients first so that anything left behind by an interrupted
        # earlier run of this cell cannot be accumulated into this step.
        optimizer.zero_grad()
        pred = model(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        # The loss is a batch mean (default reduction); weight it by the batch
        # size so the epoch figure is a per-sample mean even if the last batch
        # is smaller.
        loss_hist_train += loss.item() * y_batch.size(0)
        is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
        # Accumulated as a tensor so code that treats the result as a tensor
        # after the loop keeps working.
        accuracy_hist_train += is_correct.sum()
    # Turn the totals into per-sample averages over the whole training set.
    accuracy_hist_train /= len(train_dl.dataset)
    loss_hist_train /= len(train_dl.dataset)
    print(f"Epoch {epoch + 1} "
          f"Accuracy {accuracy_hist_train:.4f} "
          f"Loss {loss_hist_train:.4f} ")

Epoch 1 Accuracy 0.9907 Loss 0.0226 
Epoch 2 Accuracy 0.9902 Loss 0.0176 
Epoch 3 Accuracy 0.9922 Loss 0.0159 
Epoch 4 Accuracy 0.9927 Loss 0.0165 
Epoch 5 Accuracy 0.9907 Loss 0.0285 
Epoch 6 Accuracy 0.9917 Loss 0.0145 
Epoch 7 Accuracy 0.9917 Loss 0.0165 
Epoch 8 Accuracy 0.9938 Loss 0.0101 
Epoch 9 Accuracy 0.9948 Loss 0.0086 
Epoch 10 Accuracy 0.9917 Loss 0.0123 
Epoch 11 Accuracy 0.9938 Loss 0.0098 
Epoch 12 Accuracy 0.9943 Loss 0.0083 
Epoch 13 Accuracy 0.9922 Loss 0.0086 
Epoch 14 Accuracy 0.9943 Loss 0.0082 
Epoch 15 Accuracy 0.9943 Loss 0.0086 
Epoch 16 Accuracy 0.9933 Loss 0.0122 
Epoch 17 Accuracy 0.9803 Loss 0.0753 
Epoch 18 Accuracy 0.9855 Loss 0.0331 
Epoch 19 Accuracy 0.9860 Loss 0.0290 
Epoch 20 Accuracy 0.9907 Loss 0.0283 
Epoch 21 Accuracy 0.9896 Loss 0.0230 
Epoch 22 Accuracy 0.9917 Loss 0.0206 
Epoch 23 Accuracy 0.9886 Loss 0.0271 
Epoch 24 Accuracy 0.9907 Loss 0.0189 
Epoch 25 Accuracy 0.9902 Loss 0.0263 
Epoch 26 Accuracy 0.9855 Loss 0.0342 
Epoch 27 Accuracy 0.9

In [61]:
# Report the metrics of the final training epoch (the training loop overwrites
# both values on every epoch). float() accepts a plain number as well as a
# 0-d tensor, so this cell does not depend on how the values were accumulated.
print(f"Average Loss: {float(loss_hist_train):.4f}")
print(f"Train Accuracy: {100 * float(accuracy_hist_train):.4f}%")

Average Loss: 0.0081 
Train Accuracy: 99.2750%


In [53]:
# model evaluation of test dataset
# Inference only: disable autograd so no computation graph is built or kept
# alive for the forward pass over the whole test set.
with torch.no_grad():
    pred_test = model(X_test_norm)
# A prediction is correct when the highest-scoring output matches the label.
correct = (torch.argmax(pred_test,dim=1) == y_test).float()
print(f"Test Accuracy: {100 * correct.mean():.4f}%")

Test Accuracy: 80.7453%
