In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Download data

In [1]:
from tdc.single_pred import Tox

try:
    data = Tox(name='hERG_Karim')
    split = data.get_split("scaffold")
    print("Data loaded and split successfully.")
    print("Train data samples:", len(split['train']))
    print("Validation data samples:", len(split['valid']))
    print("Test data samples:", len(split['test']))
except Exception as e:
    print("An error occurred:", str(e))

Found local copy...
Loading...
Done!
100%|██████████| 13445/13445 [00:05<00:00, 2302.95it/s]

Data loaded and split successfully.
Train data samples: 9411
Validation data samples: 1344
Test data samples: 2690





# check data

In [3]:
split.keys()

dict_keys(['train', 'valid', 'test'])

In [4]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,8640,O=C1NCCN1CC[N+]1CCC(c2cn(C3CCCCC3)c3ccc(Cl)cc2...,0
1,11377,O=C(Cc1ccc(-n2cnnn2)cc1)N1CCN(CCc2ccc3nonc3c2)CC1,1
2,1461,NC(=O)c1ncc(N[C@@H]2CCCC[C@@H]2N)cc1Nc1cccc(C(...,1
3,6646,Cc1cc(C)nc(Nc2cc(N[C@@H]3CCCC[C@@H]3N)cnc2C(N)...,1
4,379,COc1cc(C)nc(Nc2cc(N[C@@H]3CCCC[C@@H]3N)cnc2C(N...,1
...,...,...,...
9406,5238,CC(C)(C)c1cc(NC(=O)n2ccc3cc(Oc4ncnc5c4CCNC5)cc...,1
9407,6201,Cc1ccc2c(C3CCN(CCc4c(C)ccc5c4ccc(=O)n5C)CC3)cc...,1
9408,11725,CCOC(=O)C1=C(CN2CCOCC2)NC(c2nccs2)=NC1c1ccc(F)...,1
9409,12714,CCCCCCOC(=O)NC(=N)c1ccc(NCc2nc3cc(C(=O)N(CCC(=...,1


In [5]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,7417,CC(C)c1noc(-c2nnc3n2CCN(C(=O)c2ccc(F)cc2)[C@@H...,1
1,3967,CCc1noc(-c2nnc3n2CCN(C(=O)c2ccc(F)cc2)[C@@H]3C)n1,0
2,13387,CC(C)(C)Cn1c(N)nc2ccc(-c3nc(C(C)(C)C)[nH]c3-c3...,0
3,10258,CC(C)(C)C1CCC2(CC1)CCN(c1ccc(OC(F)(F)F)cc1)C(=...,1
4,5612,CCN(CC)c1ccc2cc(C(=O)NCCCCN3CCC(Nc4nc5ccccc5n4...,1
...,...,...,...
2685,10596,COCCN(C)Cc1csc(-c2cn(CC3CCOCC3)c3c(Cl)cccc23)n1,1
2686,12520,CC(C)N(CCO)Cc1csc(-c2cn(CC3CCOCC3)c3c(Cl)cccc2...,1
2687,2924,CC(C)N(CCO)Cc1nc(-c2cn(CC3CCOCC3)c3c(Cl)cccc23...,1
2688,1138,CCN(CC)Cc1csc(-c2cn(CC3CCOCC3)c3c(F)cccc23)n1,1


In [6]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
import pandas as pd


# Function to convert SMILES to Morgan fingerprints
def smiles_to_fp(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits))


# Prepare the data
def prepare_data(df):
    df['features'] = df['Drug'].apply(lambda x: smiles_to_fp(x))
    X = list(df['features'])
    y = df['Y'].values
    return X, y


# Load data
train_data = split['train'] 
X_train, y_train = prepare_data(train_data)


# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


# Evaluate the model
test_data = split['train'] 
X_test, y_test = prepare_data(test_data)
y_pred = model.predict(X_test)


In [7]:
len(X_train[0])

1024

# Evaluation

In [8]:
from evaluate import eval
eval(y_test, y_pred)

Accuracy: 0.9941557751567315
ROC AUC Score: 0.9941571658369023
Precision: 0.9938113529662825
Recall: 0.9944480034166133
F1 Score: 0.9941295762621412
Matthews Correlation Coefficient: 0.9883115195647201
Specificity (Negative Prediction Accuracy): 0.9938663282571912


# Logic Tensor Network model

In [9]:
import torch

# Example of converting lists to Tensors
# Assuming 'X_train' and 'y_train' are your features and labels respectively and are initially lists

X_train = torch.tensor(X_train, dtype=torch.float32)  # Convert features to a float Tensor
y_train = torch.tensor(y_train, dtype=torch.float32)  # Convert labels to a float Tensor

# Ensure labels y_train is the right shape (e.g., for BCELoss, you might need to ensure it's two-dimensional if there are two outputs)
y_train = y_train.unsqueeze(1)  # Only do this if necessary

 # LTN: Define the Network

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1024, 64),  # Assuming input features size of 1024
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # binary classification
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)


# LTN: Define loss with Logical Constraint

In [11]:
import torch.nn.functional as F

def custom_logic_loss(outputs, targets):
    criterion = nn.BCELoss()
    classical_loss = criterion(outputs.squeeze(), targets.squeeze())
    
    # You can still implement a custom logic penalty if needed
    # For example, adding any additional logic-based constraints
    
    return classical_loss  # + any logic-based penalties


"""
def custom_logic_loss(outputs, targets):
    criterion = nn.BCELoss()
    # Assuming outputs and targets are already tensors and appropriately shaped
    classical_loss = criterion(outputs, targets)

    # If there are two logical outputs being evaluated:
    if outputs.shape[1] > 1:
        # Implementing the logical constraint:
        # If outputs[:,0] > 0.9 then outputs[:,1] < 0.1
        logic_condition = outputs[:, 0] > 0.9
        logic_penalty = torch.where(logic_condition, 1.0 - outputs[:, 1], torch.tensor(0.0).to(outputs.device))
        logic_penalty = logic_penalty.mean()  # Mean penalty per batch
    else:
        logic_penalty = torch.tensor(0.0).to(outputs.device)  # No penalty if not applicable

    return classical_loss + logic_penalty
"""    


'\ndef custom_logic_loss(outputs, targets):\n    criterion = nn.BCELoss()\n    # Assuming outputs and targets are already tensors and appropriately shaped\n    classical_loss = criterion(outputs, targets)\n\n    # If there are two logical outputs being evaluated:\n    if outputs.shape[1] > 1:\n        # Implementing the logical constraint:\n        # If outputs[:,0] > 0.9 then outputs[:,1] < 0.1\n        logic_condition = outputs[:, 0] > 0.9\n        logic_penalty = torch.where(logic_condition, 1.0 - outputs[:, 1], torch.tensor(0.0).to(outputs.device))\n        logic_penalty = logic_penalty.mean()  # Mean penalty per batch\n    else:\n        logic_penalty = torch.tensor(0.0).to(outputs.device)  # No penalty if not applicable\n\n    return classical_loss + logic_penalty\n'

# LTN: Training

In [None]:
# Assuming X_train and y_train are your datasets loaded as Tensor objects
model = SimpleNN()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Training loop
epoch_n = 200
model.train()
for epoch in range(epoch_n):
    optimizer.zero_grad()
    outputs = model(X_train)  # Make sure X_train is a tensor
    loss = custom_logic_loss(outputs, y_train)  # Ensure y_train is appropriately shaped
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    

Epoch 1, Loss: 0.694512128829956
Epoch 2, Loss: 0.6919741630554199
Epoch 3, Loss: 0.6896091103553772
Epoch 4, Loss: 0.686947762966156
Epoch 5, Loss: 0.6837431788444519
Epoch 6, Loss: 0.6799231767654419
Epoch 7, Loss: 0.6754501461982727
Epoch 8, Loss: 0.6703324317932129
Epoch 9, Loss: 0.6645869016647339
Epoch 10, Loss: 0.6582450270652771
Epoch 11, Loss: 0.6513213515281677
Epoch 12, Loss: 0.6438111066818237
Epoch 13, Loss: 0.6357401609420776
Epoch 14, Loss: 0.6271701455116272
Epoch 15, Loss: 0.6181892156600952
Epoch 16, Loss: 0.6089009046554565
Epoch 17, Loss: 0.5994260311126709
Epoch 18, Loss: 0.5898760557174683
Epoch 19, Loss: 0.5803619027137756
Epoch 20, Loss: 0.5710095167160034
Epoch 21, Loss: 0.5619361996650696
Epoch 22, Loss: 0.55324786901474
Epoch 23, Loss: 0.5450178384780884
Epoch 24, Loss: 0.5372676253318787
Epoch 25, Loss: 0.5299764275550842
Epoch 26, Loss: 0.5231009125709534
Epoch 27, Loss: 0.5165703892707825
Epoch 28, Loss: 0.5103147625923157
Epoch 29, Loss: 0.504278481006622

# LTN: Testing

In [12]:
# Assuming X_test and y_test are already loaded and are numpy arrays or lists
X_test = torch.tensor(X_test, dtype=torch.float32)  # Convert to tensor
# Ensure y_test is also a tensor if you have it and will evaluate metrics
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)  # Convert to tensor and ensure correct shape


In [13]:
# Set model to evaluation mode
model.eval()

# Disable gradient computation for testing (saves memory and computations)
with torch.no_grad():
    outputs = model(X_test)


In [14]:
# Convert outputs to predicted classes
# For binary classification with a single output unit
predictions = (outputs > 0.5).float()  # Threshold probabilities to classify as 1 or 0

# Calculate accuracy or other metrics
accuracy = (predictions == y_test).float().mean()
print("Accuracy:", accuracy.item())

# If you need to calculate other metrics such as precision, recall, and F1 score
from sklearn.metrics import precision_score, recall_score, f1_score

# Since you cannot use sklearn directly with tensors, you need to move data back to CPU and convert to numpy
predictions = predictions.cpu().numpy()
y_test = y_test.cpu().numpy()

eval(y_test, predictions)

Accuracy: 0.9956438541412354
Accuracy: 0.9956438589035274
ROC AUC Score: 0.9956451106303099
Precision: 0.9963883577650308
Recall: 0.9949087823504454
F1 Score: 0.9956480203800021
Matthews Correlation Coefficient: 0.9912888112971575
Specificity (Negative Prediction Accuracy): 0.9963814389101745
