Importing Necessary Modules

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, precision_recall_curve, auc
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from lifelines.utils import concordance_index


Loading Dataset 

In [2]:

def load_kiba_dataset(file_path):
    """Read a space-separated, header-less interaction file into a DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame whose five columns are named, in file order,
        ``Drug``, ``Protein``, ``SMILES``, ``Sequence`` and ``Affinity``.

    Raises:
        ValueError: From pandas, when the parsed file does not have exactly
            five columns (the column rename fails on a length mismatch).
    """
    column_names = ["Drug", "Protein", "SMILES", "Sequence", "Affinity"]
    interactions = pd.read_csv(file_path, header=None, sep=' ')
    interactions.columns = column_names
    return interactions


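Encoding the sequences as described in the document: each of the 20 standard amino acids is mapped to an integer from 1 to 20 (0 for any other character), and each SMILES character is mapped to an integer in the same way. Shorter sequences and SMILES strings are padded with zeros to the length of the longest one.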

In [3]:
def encode_sequences(df):
    """Integer-encode protein sequences and SMILES strings, zero-padded per column.

    Two columns are added to ``df``; the frame is modified in place and also
    returned:

    * ``Encoded_Sequence`` - one integer per residue: 1-20 for the letters of
      ``ACDEFGHIKLMNPQRSTVWY`` (in that order), 0 for any other character.
    * ``Encoded_SMILES`` - one integer per character: 1..K over the distinct
      characters present in ``df['SMILES']``, numbered in sorted order.

    Every list is right-padded with 0 up to the longest string of its column,
    so 0 serves both as the padding value and as the "unknown residue" code.

    Args:
        df: DataFrame with string columns ``Sequence`` and ``SMILES``.

    Returns:
        The same DataFrame object, with the two encoded columns added.
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    aa_to_num = {aa: idx for idx, aa in enumerate(amino_acids, start=1)}

    # Sort the vocabulary before numbering it. A bare set of strings iterates
    # in hash order, which changes between interpreter runs, so the unsorted
    # mapping would give the same character a different code on every run.
    smiles_chars = sorted(set("".join(df['SMILES'])))
    char_to_num = {ch: idx for idx, ch in enumerate(smiles_chars, start=1)}

    def _pad_encode(text, table, width):
        """Map each character through ``table`` (0 if absent) and right-pad with 0."""
        return [table.get(ch, 0) for ch in text] + [0] * (width - len(text))

    # default=0 keeps an empty frame from raising on max() of an empty sequence.
    max_seq_len = max(map(len, df['Sequence']), default=0)
    max_smiles_len = max(map(len, df['SMILES']), default=0)

    df['Encoded_Sequence'] = df['Sequence'].apply(
        lambda seq: _pad_encode(seq, aa_to_num, max_seq_len)
    )
    df['Encoded_SMILES'] = df['SMILES'].apply(
        lambda smi: _pad_encode(smi, char_to_num, max_smiles_len)
    )

    return df


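Two evaluation cases, as given in the document:
a) no new proteins in testing (each protein's rows are split between training and testing)
b) new proteins in testing, as mentioned in the discussion (test proteins never appear in training)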

In [4]:
def train_test_split_proteins(df, new_proteins=True, n_test_proteins=42,
                              train_frac=0.7, random_state=42):
    """Split interaction rows into train and test sets, by protein.

    Args:
        df: DataFrame with a ``Protein`` column.
        new_proteins: If True, hold out whole proteins so that no test protein
            appears in training. If False, split the rows of every protein
            between train and test.
        n_test_proteins: Number of distinct proteins held out when
            ``new_proteins`` is True.
        train_frac: Fraction of each protein's rows sampled into the training
            set when ``new_proteins`` is False.
        random_state: Seed used for the protein split and for every
            per-protein row sample.

    Returns:
        ``(train_data, test_data)`` DataFrames that keep the index labels of
        ``df``.

    Raises:
        ValueError: From scikit-learn, when ``n_test_proteins`` leaves no
            protein for training (``new_proteins=True`` only).
    """
    if new_proteins:
        proteins = df['Protein'].unique()
        # Pass the hold-out size as an integer count. Expressing it as the
        # float 42 / len(proteins) makes scikit-learn recompute the count with
        # ceil(fraction * n), which can round up to one protein too many.
        train_proteins, test_proteins = train_test_split(
            proteins, test_size=n_test_proteins, random_state=random_state
        )
        train_data = df[df['Protein'].isin(train_proteins)]
        test_data = df[df['Protein'].isin(test_proteins)]
        return train_data, test_data

    train_parts = []
    test_parts = []
    # sort=False visits proteins in order of first appearance, and one grouped
    # pass replaces a full-frame boolean mask per protein.
    for _, protein_rows in df.groupby('Protein', sort=False):
        # Reusing the same seed for each protein makes every protein's split
        # independent of the others.
        train_part = protein_rows.sample(frac=train_frac, random_state=random_state)
        train_parts.append(train_part)
        test_parts.append(protein_rows.drop(train_part.index))

    if not train_parts:
        # pd.concat rejects an empty list; return two empty frames instead.
        return df.iloc[0:0], df.iloc[0:0]

    return pd.concat(train_parts), pd.concat(test_parts)

Loading dataset into Tensors

In [5]:
def prepare_dataloader(df, batch_size=32, shuffle=True):
    """Wrap the encoded columns of ``df`` in a PyTorch ``DataLoader``.

    Each feature row is the ``Encoded_SMILES`` list followed by the
    ``Encoded_Sequence`` list, stored as float32. The target is ``Affinity``,
    also float32.

    Args:
        df: DataFrame with ``Encoded_SMILES``, ``Encoded_Sequence`` (lists of
            equal length within each column) and ``Affinity`` columns.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the rows on every pass. The
            default matches the previous hard-coded behaviour; pass False for
            a stable row order, e.g. when evaluating.

    Returns:
        torch.utils.data.DataLoader yielding ``(features, target)`` batches.
    """
    smiles_codes = np.array(df['Encoded_SMILES'].tolist())
    sequence_codes = np.array(df['Encoded_Sequence'].tolist())
    affinities = np.array(df['Affinity'].tolist())

    # Concatenate along the feature axis: SMILES codes first, then sequence codes.
    features = torch.tensor(np.hstack([smiles_codes, sequence_codes]), dtype=torch.float32)
    targets = torch.tensor(affinities, dtype=torch.float32)

    dataset = torch.utils.data.TensorDataset(features, targets)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


Training the model: a neural network with 4 fully connected layers

In [6]:
class DrugProteinNN(nn.Module):
    """Fully connected regressor: input_size -> 512 -> 256 -> 128 -> 1.

    Each of the three hidden linear layers is followed by batch normalisation
    and a ReLU. The last linear layer emits one unactivated value per sample.
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        # Hidden stage 1
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        # Hidden stage 2
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        # Hidden stage 3
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        # Output head: a single regression value
        self.fc4 = nn.Linear(128, 1)

    def forward(self, x):
        """Return the ``(batch, 1)`` prediction for a ``(batch, input_size)`` input."""
        activation = nn.ReLU()
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = activation(norm(linear(x)))
        return self.fc4(x)


def train_model(model, train_loader, criterion, optimizer, epochs=20):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: Module whose output has a trailing dimension of size 1, i.e.
            shape ``(batch, 1)``.
        train_loader: Iterable of ``(features, target)`` batches, with
            ``target`` of shape ``(batch,)``.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer built over ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # Drop only the trailing output dimension. A bare squeeze() would
            # also drop the batch dimension of a one-sample batch, leaving a
            # 0-d prediction that broadcasts against the (1,) target.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            # Without this guard the average below divides by zero.
            raise ValueError("train_loader yielded no batches; cannot train on an empty dataset")

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / n_batches:.4f}")


Evaluating the 5 metrics mentioned in the docs: MSE, R², concordance index (CI), Pearson correlation and AUPR

In [7]:
def evaluate_model_with_metrics(model, test_loader, threshold=12.1):
    """Run ``model`` over ``test_loader`` and print and return five metrics.

    Args:
        model: Module producing one prediction per sample; it is switched to
            eval mode and left there.
        test_loader: Iterable of ``(features, target)`` batches.
        threshold: Affinity cut-off used to binarise the targets for AUPR
            (``target > threshold`` is the positive class). The default keeps
            the previously hard-coded value.

    Returns:
        Tuple ``(mse, r2, ci, pearson_corr, aupr)``. ``aupr`` is NaN when no
        target exceeds ``threshold``, because the precision-recall curve has
        no positive class to score.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    true_chunks = []
    pred_chunks = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            # Flatten with reshape(-1) rather than a bare squeeze(): squeeze()
            # turns a one-sample batch into a 0-d tensor, which cannot be
            # iterated when the predictions are collected.
            outputs = model(X_batch).reshape(-1)
            true_chunks.append(y_batch.reshape(-1).cpu().numpy())
            pred_chunks.append(outputs.cpu().numpy())

    if not true_chunks:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    y_true = np.concatenate(true_chunks)
    y_pred = np.concatenate(pred_chunks)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    ci = concordance_index(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred)

    # AUPR treats the regression output as a score for "affinity above threshold".
    labels = (y_true > threshold).astype(int)
    if labels.any():
        precision, recall, _ = precision_recall_curve(labels, y_pred)
        aupr = auc(recall, precision)
    else:
        # No positives: report NaN rather than the area of a degenerate curve.
        aupr = float("nan")

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R² Score: {r2:.4f}")
    print(f"Concordance Index (CI): {ci:.4f}")
    print(f"Pearson Correlation (R): {pearson_corr:.4f}")
    print(f"Area Under Precision-Recall Curve (AUPR): {aupr:.4f}")

    return mse, r2, ci, pearson_corr, aupr


Main function which calls other functions

In [None]:
if __name__ == "__main__":
    # Seed for torch's global RNG, which drives weight initialisation and the
    # shuffling order of the data loaders.
    SEED = 42

    datasets = ["davis.txt", "davis-filter.txt","kiba.txt"]
    # The two split strategies: every protein seen in training, or whole
    # proteins held out for testing.
    cases = [
        {"name": "Seen Proteins", "new_proteins": False},
        {"name": "New Proteins", "new_proteins": True}
    ]

    for file_path in datasets:
        print(f"Processing dataset: {file_path}")
        df = load_kiba_dataset(file_path)
        df = encode_sequences(df)

        for case in cases:
            print(f"\nCase: {case['name']}")

            # Reseed before every run so each (dataset, case) result can be
            # reproduced on its own, regardless of what ran before it.
            torch.manual_seed(SEED)

            train_data, test_data = train_test_split_proteins(df, new_proteins=case['new_proteins'])
            train_loader = prepare_dataloader(train_data)
            test_loader = prepare_dataloader(test_data)

            # Feature width = padded SMILES length + padded sequence length.
            input_size = train_loader.dataset[0][0].shape[0]
            model = DrugProteinNN(input_size)

            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            print("Training the model...")
            train_model(model, train_loader, criterion, optimizer, epochs=20)

            print("Evaluating the model...")
            # NOTE(review): the same AUPR cut-off is applied to every dataset;
            # confirm it suits both the Davis and the KIBA affinity scales.
            evaluate_model_with_metrics(model, test_loader)


Processing dataset: davis.txt

Case: Seen Proteins
Training the model...
Epoch 1/20, Loss: 1.8934
Epoch 2/20, Loss: 0.6342
Epoch 3/20, Loss: 0.5766
Epoch 4/20, Loss: 0.5518
Epoch 5/20, Loss: 0.5373
Epoch 6/20, Loss: 0.5163
Epoch 7/20, Loss: 0.4852
Epoch 8/20, Loss: 0.4676
Epoch 9/20, Loss: 0.4479
Epoch 10/20, Loss: 0.4238
