In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [None]:
# Define the dataset class
class DrugTargetDataset(Dataset):
    """Map-style dataset producing tokenized (SMILES, protein, label) triples.

    Args:
        df: DataFrame with "smiles", "prot_seq" and "class" columns.
        tokenizers: Indexable pair; element 0 tokenizes SMILES strings and
            element 1 tokenizes protein sequences. Each must expose
            ``texts_to_sequences(list_of_str) -> list_of_int_lists``.
    """

    def __init__(self, df, tokenizers):
        self.df = df
        self.tokenizers = tokenizers

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(smiles_ids, prot_ids, label)`` for the row at position ``idx``.

        The two id tensors are 1-D ``LongTensor``s whose length follows the
        tokenizer output (no padding is applied here); ``label`` is a
        ``LongTensor`` of shape ``(1,)``.
        """
        # Fetch the row once and reuse it for all three fields.
        row = self.df.iloc[idx]
        smiles_tokenizer = self.tokenizers[0]
        prot_tokenizer = self.tokenizers[1]

        # texts_to_sequences works on a batch of texts; wrap and unwrap a single one.
        smiles_ids = smiles_tokenizer.texts_to_sequences([row["smiles"]])[0]
        prot_ids = prot_tokenizer.texts_to_sequences([row["prot_seq"]])[0]

        return (
            torch.LongTensor(smiles_ids),
            torch.LongTensor(prot_ids),
            torch.LongTensor([row["class"]]),
        )

In [None]:
# Load the dataset
df = pd.read_csv("data/davis_filtered.csv")

# NOTE(review): `Tokenizer` is not imported anywhere in this notebook. The
# arguments and methods used (char_level, fit_on_texts, texts_to_sequences,
# word_index) match the Keras text Tokenizer — confirm, and import it in the
# imports cell so the notebook survives Restart & Run All.
# lower=False: the Keras tokenizer lowercases by default, which would merge
# case-distinct SMILES symbols such as "C" and "c".
smiles_tokenizer = Tokenizer(char_level=True, lower=False)
smiles_tokenizer.fit_on_texts(df["smiles"])
# Tokenize proteins per character (residue). A word-level tokenizer splits on
# whitespace, so a sequence without spaces would collapse into one token.
prot_tokenizer = Tokenizer(char_level=True)
prot_tokenizer.fit_on_texts(df["prot_seq"])

# Encode class labels as consecutive integers.
le = LabelEncoder()
df["class"] = le.fit_transform(df["class"])
dataset = DrugTargetDataset(df, [smiles_tokenizer, prot_tokenizer])

# Split the dataset into training and validation sets (seeded for reproducibility)
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Shortest input that survives the CNN's two (Conv1d kernel 5 -> MaxPool1d 2) stages.
MIN_SEQ_LEN = 16


def _pad_batch(sequences):
    """Right-pad 1-D id tensors with 0 to a common length of at least MIN_SEQ_LEN."""
    padded = torch.nn.utils.rnn.pad_sequence(
        list(sequences), batch_first=True, padding_value=0
    )
    shortfall = MIN_SEQ_LEN - padded.size(1)
    if shortfall > 0:
        padded = nn.functional.pad(padded, (0, shortfall), value=0)
    return padded


def pad_collate(batch):
    """Collate (smiles, prot_seq, label) samples into padded batch tensors.

    The default collate function stacks tensors and therefore fails on
    variable-length sequences. Index 0 is used for padding; tokenizer ids
    start at 1 (the vocabulary size is computed as len(word_index) + 1).

    Returns:
        (smiles, prot_seq, labels) with shapes (B, Ls), (B, Lp) and (B,).
    """
    smiles_seqs, prot_seqs, labels = zip(*batch)
    # Flatten labels to (B,), the target shape nn.CrossEntropyLoss expects.
    return _pad_batch(smiles_seqs), _pad_batch(prot_seqs), torch.stack(labels).view(-1)


# Define the data loaders
batch_size = 64
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=pad_collate)

In [None]:
# Define the CNN model
class CNN(nn.Module):
    """Two-branch 1-D CNN classifier over a SMILES string and a protein sequence.

    Each branch embeds its token ids, applies two Conv1d(kernel 5) + ReLU +
    MaxPool1d(2) stages, and is then adaptively max-pooled to ``pooled_len``
    positions so the flattened feature size does not depend on the input
    length. The two branches are concatenated and classified by two linear
    layers.

    Args:
        vocab_size_smiles: Number of rows in the SMILES embedding table.
        vocab_size_prot: Number of rows in the protein embedding table.
        embedding_dim: Embedding width for both branches.
        num_filters: Output channels of every convolution.
        num_classes: Number of output classes (default 19).
        pooled_len: Positions kept per branch after adaptive pooling
            (default 25, so ``fc1`` keeps ``num_filters * 2 * 25`` inputs).
    """

    def __init__(self, vocab_size_smiles, vocab_size_prot, embedding_dim, num_filters,
                 num_classes=19, pooled_len=25):
        super(CNN, self).__init__()

        # Keep the hyperparameters on the module; forward() must not rely on
        # notebook-level globals.
        self.num_filters = num_filters
        self.pooled_len = pooled_len

        # SMILES embedding layer
        self.embedding_smiles = nn.Embedding(vocab_size_smiles, embedding_dim)

        # Protein embedding layer
        self.embedding_prot = nn.Embedding(vocab_size_prot, embedding_dim)

        # Convolutional layers
        self.conv1_smiles = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=5)
        self.conv2_smiles = nn.Conv1d(in_channels=num_filters, out_channels=num_filters, kernel_size=5)
        self.conv1_prot = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=5)
        self.conv2_prot = nn.Conv1d(in_channels=num_filters, out_channels=num_filters, kernel_size=5)

        # Max pooling layers
        self.pool1_smiles = nn.MaxPool1d(kernel_size=2)
        self.pool2_smiles = nn.MaxPool1d(kernel_size=2)
        self.pool1_prot = nn.MaxPool1d(kernel_size=2)
        self.pool2_prot = nn.MaxPool1d(kernel_size=2)

        # Fully connected layers: two branches of num_filters * pooled_len features each
        self.fc1 = nn.Linear(num_filters * 2 * pooled_len, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def _encode(self, tokens, embedding, conv1, pool1, conv2, pool2):
        """Run one branch and return features of shape (batch, num_filters * pooled_len)."""
        x = embedding(tokens)          # (batch, seq, embedding_dim)
        x = x.permute(0, 2, 1)         # Conv1d expects (batch, channels, seq)
        x = pool1(nn.functional.relu(conv1(x)))
        x = pool2(nn.functional.relu(conv2(x)))
        # Fix the temporal size so fc1's input width is independent of sequence length.
        x = nn.functional.adaptive_max_pool1d(x, self.pooled_len)
        # flatten(start_dim=1) keeps the batch dimension intact, unlike view(-1, ...),
        # which silently re-batches when the feature count is not the expected one.
        return x.flatten(start_dim=1)

    def forward(self, x_smiles, x_prot):
        """Return unnormalized class scores (logits) of shape (batch, num_classes).

        Args:
            x_smiles: LongTensor of SMILES token ids, shape (batch, seq_len).
            x_prot: LongTensor of protein token ids, shape (batch, seq_len).
                Both inputs need at least 16 positions to pass the two
                conv/pool stages.

        No softmax is applied: nn.CrossEntropyLoss expects raw logits. Use
        ``softmax(dim=1)`` on the result if probabilities are needed.
        """
        # SMILES forward pass
        feat_smiles = self._encode(
            x_smiles, self.embedding_smiles,
            self.conv1_smiles, self.pool1_smiles,
            self.conv2_smiles, self.pool2_smiles,
        )

        # Protein forward pass
        feat_prot = self._encode(
            x_prot, self.embedding_prot,
            self.conv1_prot, self.pool1_prot,
            self.conv2_prot, self.pool2_prot,
        )

        # Concatenate SMILES and Protein features
        x = torch.cat((feat_smiles, feat_prot), dim=1)

        # Fully connected layers
        x = nn.functional.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
# Initialize the model
# +1 because tokenizer ids start at 1; index 0 is left for padding.
vocab_size_smiles = len(smiles_tokenizer.word_index) + 1
vocab_size_prot = len(prot_tokenizer.word_index) + 1
embedding_dim = 128
num_filters = 32
lr = 0.001
model = CNN(vocab_size_smiles, vocab_size_prot, embedding_dim, num_filters)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # make sure training mode is active at the start of every epoch
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs_smiles, inputs_prot, labels = data
        # Labels may arrive as (batch, 1) because each sample's label is a
        # 1-element tensor; CrossEntropyLoss needs class indices of shape (batch,).
        labels = labels.view(-1)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs_smiles, inputs_prot)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

print('Finished Training')

# Evaluate the model on the validation set
model.eval()  # set the model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for data in val_loader:
        inputs_smiles, inputs_prot, labels = data
        # Flatten to (batch,): comparing (batch,) predictions against (batch, 1)
        # labels would broadcast to (batch, batch) and inflate the correct count.
        labels = labels.view(-1)
        outputs = model(inputs_smiles, inputs_prot)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Guard against an empty validation split.
val_acc = correct / total if total else float('nan')
print(f'Validation accuracy: {val_acc:.4f}')



# Web scraping DrugBank

In [None]:
!pip install beautifulsoup4



In [None]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Load the drug identifier spreadsheet.
# NOTE(review): the absolute /content path looks like a Colab session upload —
# confirm the file is present there before running this cell.
table = pd.read_excel('/content/drug_nodes_identifiers.xlsx')

In [None]:
# Column of DrugBank ids; the scraping loop requests one page per entry.
drug_id = table["drugbank-id"]

In [None]:
def extract_words_between_strings(sentence, start_string, end_string):
    """Return every stripped substring found between the two delimiters.

    The text is scanned left to right. Each occurrence of ``start_string``
    is paired with the next ``end_string`` after it, and the text between
    them (whitespace-stripped) is collected. The search for the following
    ``start_string`` resumes at the position of the matched ``end_string``.
    Scanning stops at the first ``start_string`` with no closing
    ``end_string``.

    Args:
        sentence: Text to search.
        start_string: Opening delimiter; must be non-empty.
        end_string: Closing delimiter.

    Returns:
        List of extracted substrings in order of appearance (possibly empty).

    Raises:
        ValueError: If ``start_string`` is empty. An empty opening delimiter
            matches at every position without advancing, so the scan would
            otherwise never terminate.
    """
    if not start_string:
        raise ValueError("start_string must be a non-empty string")

    extracted_words = []
    search_from = 0

    while True:
        start_index = sentence.find(start_string, search_from)
        if start_index == -1:
            break

        content_start = start_index + len(start_string)
        end_index = sentence.find(end_string, content_start)
        if end_index == -1:
            # Unterminated section: nothing more can be extracted.
            break

        extracted_words.append(sentence[content_start:end_index].strip())
        # Resume at the closing delimiter itself.
        search_from = end_index

    return extracted_words

In [None]:
# Section headings used as delimiters: the text between them on a drug page
# is extracted as that drug's associated conditions.
start_string = "Associated Conditions"
end_string = "Contraindications"
# Each drug page is requested as baseURL + <drugbank-id>.
baseURL = "https://go.drugbank.com/drugs/"

# Result table of (assoc_cond, drugbank-id) rows; populated by the scraping loop.
drug_cond = pd.DataFrame()

In [None]:
# Scrape the "Associated Conditions" section of every drug page.
# Per-drug frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies it on every iteration.
cond_frames = []
for dbid in drug_id:
    # Without a timeout a single stalled connection blocks the loop forever.
    page = req.get(baseURL + str(dbid), timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    content = soup.text
    disease_list = extract_words_between_strings(content, start_string, end_string)
    # Skip empty sections, then split each section into one condition per line.
    disease_names = [
        name.strip()
        for disease in disease_list
        if disease.strip()
        for name in disease.split('\n')
    ]
    data = pd.DataFrame({"assoc_cond": disease_names})
    data["drugbank-id"] = dbid
    cond_frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty, correctly-shaped frame.
if cond_frames:
    drug_cond = pd.concat(cond_frames, ignore_index=True)
else:
    drug_cond = pd.DataFrame(columns=["assoc_cond", "drugbank-id"])

In [None]:
# Drop rows that are just the next section's heading or are blank.
# .copy() gives an independent frame, so later column assignments on
# df_filtered do not act on a slice of drug_cond (SettingWithCopyWarning).
heading_or_blank = drug_cond['assoc_cond'].isin(["Associated Therapies", ""])
df_filtered = drug_cond[~heading_or_blank].copy()

In [None]:
# Cut a trailing "Associated Therapies..." heading that got glued onto a
# condition name. assign() builds a new frame instead of writing into what
# may be a filtered slice of drug_cond.
df_filtered = df_filtered.assign(
    assoc_cond=df_filtered['assoc_cond'].str.replace(
        r'(.*)Associated Therapies.*', r'\1', regex=True
    )
)

In [None]:
# Display the cleaned condition / drug id table as the cell's output.
df_filtered

Unnamed: 0,assoc_cond,drugbank-id
0,Locally Advanced Squamous Cell Carcinomas of t...,DB00002
1,Metastatic Colorectal Cancer (CRC),DB00002
2,Metastatic Squamous Cell Carcinoma of the Head...,DB00002
3,Regionally Advanced Squamous Cell Carcinoma of...,DB00002
4,Recurrent Squamous Cell Carcinoma of the Head ...,DB00002
...,...,...
15006,Coronavirus Disease 2019 (COVID‑19),DB17090
15007,Coronavirus Disease 2019 (COVID‑19),DB17091
15008,Coronavirus Disease 2019 (COVID‑19),DB17095
15009,High risk BCG-unresponsive non-muscle invasive...,DB17381


No charts were generated by quickchart


In [None]:
# to_csv has no `drop_index` argument (it raises TypeError); index=False is
# the supported way to leave the row index out of the file.
df_filtered.to_csv("DBID_AssocCondn.csv", index=False)