In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Download data

In [2]:
from tdc.single_pred import Tox

try:
    data = Tox(name='hERG_Karim')
    split = data.get_split()
    print("Data loaded and split successfully.")
    print("Train data samples:", len(split['train']))
    print("Validation data samples:", len(split['valid']))
    print("Test data samples:", len(split['test']))
except Exception as e:
    print("An error occurred:", str(e))

Found local copy...
Loading...
Done!


Data loaded and split successfully.
Train data samples: 9412
Validation data samples: 1344
Test data samples: 2689


# check data

In [15]:
split.keys()

dict_keys(['train', 'valid', 'test'])

In [3]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
1,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
2,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
3,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0
4,5,Cc1cc(Nc2cncc(N[C@@H](C)c3ncc(F)cn3)n2)n[nH]1,0
...,...,...,...
9407,13437,O=c1cc(-c2ccccc2)oc2cc(O)c(O)c(O)c12,0
9408,13438,CN(C)c1ccc2cc(C(=O)N[C@@H](CCCNC(=N)CCl)C(=O)N...,0
9409,13439,CCN(CC)CCCCNc1ncc2c(n1)N(C)C(=O)N(c1c(Cl)c(OC)...,0
9410,13441,Cc1cccc(-c2n[nH]cc2-c2ccc3ncccc3n2)n1,0


In [4]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,10325,COc1nccnc1CC1=C(CCN(C)C)Cc2cc(Cl)ccc21,1
1,11864,O=C(NC1CCN(Cc2ccn(-c3ccc(C(F)(F)F)cc3)c2)CC1)N...,0
2,5893,Fc1ccc(Cn2cc(NCCN3CCCCC3)nn2)cc1F,0
3,6664,CC1(C)C[C@@H](NC(=O)CC(O)(C(F)(F)F)C(F)(F)F)c2...,0
4,977,O=C(O)C[C@@H]1c2ccccc2C[C@H]1NC(=O)c1cc2sc(Cl)...,0
...,...,...,...
2684,6422,O=c1cc([C@H]2CCN[C@@H](Cc3ccccc3)C2)o[nH]1,0
2685,11883,C[C@@H]1CN(C(=O)c2ccccc2)CCN1C(=O)C(=O)c1c[nH]...,0
2686,7349,O=C(Nc1ccc(-c2nnn[nH]2)cc1F)C(C1CCCCC1)n1c(-c2...,0
2687,8836,C[C@]1(CS(=O)(=O)N2CCN(c3ncc(OCc4ccc(C(F)(F)F)...,0


In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
import pandas as pd


# Function to convert SMILES to Morgan fingerprints
def smiles_to_fp(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits))


# Prepare the data
def prepare_data(df):
    df['features'] = df['Drug'].apply(lambda x: smiles_to_fp(x))
    X = list(df['features'])
    y = df['Y'].values
    return X, y


# Load data
train_data = split['train'] 
X_train, y_train = prepare_data(train_data)


# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


# Evaluate the model
test_data = split['train'] 
X_test, y_test = prepare_data(test_data)
y_pred = model.predict(X_test)


In [6]:
len(X_train[0])

1024

# Evaluation

In [7]:
from evaluate import eval
eval(y_test, y_pred)

Accuracy: 0.9963875903102423
ROC AUC Score: 0.9963886635698163
Precision: 0.9970263381478335
Recall: 0.9957573186253712
F1 Score: 0.9963914243260454
Matthews Correlation Coefficient: 0.992775982302686
Specificity (Negative Prediction Accuracy): 0.9970200085142614


# Logic Tensor Network model

In [8]:
import torch

# Example of converting lists to Tensors
# Assuming 'X_train' and 'y_train' are your features and labels respectively and are initially lists

X_train = torch.tensor(X_train, dtype=torch.float32)  # Convert features to a float Tensor
y_train = torch.tensor(y_train, dtype=torch.float32)  # Convert labels to a float Tensor

# Ensure labels y_train is the right shape (e.g., for BCELoss, you might need to ensure it's two-dimensional if there are two outputs)
y_train = y_train.unsqueeze(1)  # Only do this if necessary

 # LTN: Define the Network

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1024, 64),  # Assuming input features size of 1024
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # binary classification
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)


# LTN: Define loss with Logical Constraint

In [10]:
import torch.nn.functional as F

def custom_logic_loss(outputs, targets):
    criterion = nn.BCELoss()
    classical_loss = criterion(outputs.squeeze(), targets.squeeze())
    
    # You can still implement a custom logic penalty if needed
    # For example, adding any additional logic-based constraints
    
    return classical_loss  # + any logic-based penalties


"""
def custom_logic_loss(outputs, targets):
    criterion = nn.BCELoss()
    # Assuming outputs and targets are already tensors and appropriately shaped
    classical_loss = criterion(outputs, targets)

    # If there are two logical outputs being evaluated:
    if outputs.shape[1] > 1:
        # Implementing the logical constraint:
        # If outputs[:,0] > 0.9 then outputs[:,1] < 0.1
        logic_condition = outputs[:, 0] > 0.9
        logic_penalty = torch.where(logic_condition, 1.0 - outputs[:, 1], torch.tensor(0.0).to(outputs.device))
        logic_penalty = logic_penalty.mean()  # Mean penalty per batch
    else:
        logic_penalty = torch.tensor(0.0).to(outputs.device)  # No penalty if not applicable

    return classical_loss + logic_penalty
"""    


'\ndef custom_logic_loss(outputs, targets):\n    criterion = nn.BCELoss()\n    # Assuming outputs and targets are already tensors and appropriately shaped\n    classical_loss = criterion(outputs, targets)\n\n    # If there are two logical outputs being evaluated:\n    if outputs.shape[1] > 1:\n        # Implementing the logical constraint:\n        # If outputs[:,0] > 0.9 then outputs[:,1] < 0.1\n        logic_condition = outputs[:, 0] > 0.9\n        logic_penalty = torch.where(logic_condition, 1.0 - outputs[:, 1], torch.tensor(0.0).to(outputs.device))\n        logic_penalty = logic_penalty.mean()  # Mean penalty per batch\n    else:\n        logic_penalty = torch.tensor(0.0).to(outputs.device)  # No penalty if not applicable\n\n    return classical_loss + logic_penalty\n'

# LTN: Training

In [11]:
# Assuming X_train and y_train are your datasets loaded as Tensor objects
model = SimpleNN()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Training loop
epoch_n = 200
model.train()
for epoch in range(epoch_n):
    optimizer.zero_grad()
    outputs = model(X_train)  # Make sure X_train is a tensor
    loss = custom_logic_loss(outputs, y_train)  # Ensure y_train is appropriately shaped
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    

Epoch 1, Loss: 0.6954758763313293
Epoch 2, Loss: 0.6912357807159424
Epoch 3, Loss: 0.6874720454216003
Epoch 4, Loss: 0.6836674213409424
Epoch 5, Loss: 0.6794655323028564
Epoch 6, Loss: 0.6747002005577087
Epoch 7, Loss: 0.669296383857727
Epoch 8, Loss: 0.6632663011550903
Epoch 9, Loss: 0.6566621661186218
Epoch 10, Loss: 0.6495003700256348
Epoch 11, Loss: 0.6418114900588989
Epoch 12, Loss: 0.6336370706558228
Epoch 13, Loss: 0.6250107288360596
Epoch 14, Loss: 0.6160174012184143
Epoch 15, Loss: 0.6067817211151123
Epoch 16, Loss: 0.5974081158638
Epoch 17, Loss: 0.5880060791969299
Epoch 18, Loss: 0.5786936283111572
Epoch 19, Loss: 0.5695618987083435
Epoch 20, Loss: 0.5607056617736816
Epoch 21, Loss: 0.552213191986084
Epoch 22, Loss: 0.5441372990608215
Epoch 23, Loss: 0.5365136861801147
Epoch 24, Loss: 0.5293314456939697
Epoch 25, Loss: 0.5225563645362854
Epoch 26, Loss: 0.5161387324333191
Epoch 27, Loss: 0.5100336670875549
Epoch 28, Loss: 0.5041930079460144
Epoch 29, Loss: 0.4985920786857605

# LTN: Testing

In [12]:
# Assuming X_test and y_test are already loaded and are numpy arrays or lists
X_test = torch.tensor(X_test, dtype=torch.float32)  # Convert to tensor
# Ensure y_test is also a tensor if you have it and will evaluate metrics
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)  # Convert to tensor and ensure correct shape


In [13]:
# Set model to evaluation mode
model.eval()

# Disable gradient computation for testing (saves memory and computations)
with torch.no_grad():
    outputs = model(X_test)


In [14]:
# Convert outputs to predicted classes
# For binary classification with a single output unit
predictions = (outputs > 0.5).float()  # Threshold probabilities to classify as 1 or 0

# Calculate accuracy or other metrics
accuracy = (predictions == y_test).float().mean()
print("Accuracy:", accuracy.item())

# If you need to calculate other metrics such as precision, recall, and F1 score
from sklearn.metrics import precision_score, recall_score, f1_score

# Since you cannot use sklearn directly with tensors, you need to move data back to CPU and convert to numpy
predictions = predictions.cpu().numpy()
y_test = y_test.cpu().numpy()

eval(y_test, predictions)

Accuracy: 0.9956438541412354
Accuracy: 0.9956438589035274
ROC AUC Score: 0.9956451106303099
Precision: 0.9963883577650308
Recall: 0.9949087823504454
F1 Score: 0.9956480203800021
Matthews Correlation Coefficient: 0.9912888112971575
Specificity (Negative Prediction Accuracy): 0.9963814389101745
