In [1]:
# Install necessary packages
!pip install rdkit
!pip install duckdb
!pip install pandas networkx
!pip install torch
!pip install torch-geometric

# Import libraries
import numpy as np 
import pandas as pd 
import duckdb
from torch.utils.data import Dataset
from rdkit import Chem
from rdkit.Chem import AllChem
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━

In [2]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.parquet'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()

con.close()

df = df.drop(['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'], axis=1)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
pd.set_option('display.max_colwidth', None)
df.sample(n=10)

Unnamed: 0,id,molecule_smiles,protein_name,binds
42116,82324448,NC(=O)NCCNc1nc(NCCC(=O)NCc2ccccc2)nc(NCC2CCC(C(=O)N[Dy])CC2)n1,sEH,1
52342,222068000,O=C(N[Dy])c1ccncc1Nc1nc(NCCc2ccno2)nc(Nc2ccc(Cn3ccnc3)cc2)n1,sEH,1
34069,81970433,COc1ccc(Cl)c(Nc2nc(NCCc3nc4c(C)cccc4o3)nc(NCC3CCC(C(=O)N[Dy])CC3)n2)c1,sEH,1
42557,271798874,COC(=O)c1cncc(Nc2nc(NCc3nccn3-c3ccccc3)nc(N[C@H](CC(=O)N[Dy])Cc3ccc([N+](=O)[O-])cc3)n2)c1,sEH,1
27165,276223083,Cc1nc(CNc2nc(NCCC3CCCC3(F)F)nc(N3CCC[C@@H]3CC(=O)N[Dy])n2)oc1C,BRD4,0
11948,176059178,O=C(N[Dy])c1ccc(F)cc1Nc1nc(NCc2cnc(Cl)s2)nc(NCc2cnc(-c3ccccc3)s2)n1,sEH,0
55062,239697409,C=CCOCCNc1nc(NCC(=O)N[Dy])nc(Nc2ncnc3[nH]ncc23)n1,HSA,1
46348,178093612,O=C(N[Dy])c1ccc([N+](=O)[O-])cc1Nc1nc(NCc2cncc(F)c2)nc(NCc2cnc(Cl)s2)n1,HSA,1
10878,258090399,Cc1cccc(Nc2nc(Nc3cccc(-n4cncn4)c3)nc(N[C@H](CC(=O)N[Dy])c3ccc(Cl)cc3)n2)c1Cl,BRD4,0
14546,105146004,Cc1ccc(CNc2nc(NCC3CC(CC(N)=O)CO3)nc(N[C@@H](Cc3ccco3)C(=O)N[Dy])n2)n1C,BRD4,0


In [4]:
# Encode protein names to numerical values
protein_encoder = LabelEncoder()
protein_encoder.fit(['HSA', 'BRD4', 'sEH'])

# Function to convert SMILES and protein to graph
def smiles_to_graph(smiles, protein):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Convert molecule to graph data structure
    nodes = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    edges = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in mol.GetBonds()]
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    x = torch.tensor(nodes, dtype=torch.float).view(-1, 1)

    # Encode protein as a feature
    protein_encoded = protein_encoder.transform([protein])[0]
    protein_feature = torch.tensor([protein_encoded], dtype=torch.float)

    # Add protein feature to each node
    protein_features = protein_feature.repeat(x.size(0), 1)
    x = torch.cat([x, protein_features], dim=1)

    return Data(x=x, edge_index=edge_index)

# Apply the function to the dataframe
df['graph'] = df.apply(lambda row: smiles_to_graph(row['molecule_smiles'], row['protein_name']), axis=1)

# Filter out invalid graphs
df = df[df['graph'].notnull()]

# Example of accessing a graph
# example_graph = df['graph'].iloc[0]
# print(example_graph)

In [5]:
class MoleculeDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        data = self.dataframe.iloc[idx]
        graph = data['graph']
        label = torch.tensor(data['binds'], dtype=torch.long)
        return graph, label

# Create dataset and data loader
dataset = MoleculeDataset(df)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool

class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=2):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=heads)
        self.output_dim = output_dim  # Add output_dim as an attribute

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)  # Pooling to get graph-level representation
        return F.log_softmax(x, dim=1)

# Example model instantiation
input_dim = 2  # 1 for atomic number + 1 for encoded protein feature
hidden_dim = 8
output_dim = 2  # Binary classification: bind (1) or not bind (0)
model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [9]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

epochs = 20
model.train()
for epoch in range(epochs):
    total_loss = 0
    for data, labels in data_loader:
        optimizer.zero_grad()
        out = model(data)
        # print(f"Output shape: {out.shape}, Labels shape: {labels.shape}")  
        if out.shape[0] != labels.shape[0]:  
            raise ValueError(f"Output batch size {out.shape[0]} does not match target batch size {labels.shape[0]}")
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}')

Epoch 1, Loss: 0.6872092079798381
Epoch 2, Loss: 0.6859827471097311
Epoch 3, Loss: 0.6862127470016479
Epoch 4, Loss: 0.6861845527648925
Epoch 5, Loss: 0.686002317905426
Epoch 6, Loss: 0.6860190305391948
Epoch 7, Loss: 0.6859004495938619
Epoch 8, Loss: 0.6859833953857422
Epoch 9, Loss: 0.6857061873753866
Epoch 10, Loss: 0.6858558313687643
Epoch 11, Loss: 0.6856628608385722
Epoch 12, Loss: 0.6856004294077556
Epoch 13, Loss: 0.6855932339032491
Epoch 14, Loss: 0.6855092583656311
Epoch 15, Loss: 0.6853770582834879
Epoch 16, Loss: 0.6853538215637207
Epoch 17, Loss: 0.6844786180178324
Epoch 18, Loss: 0.683842852306366
Epoch 19, Loss: 0.6850246935208638
Epoch 20, Loss: 0.6843383451779683


In [11]:
def evaluate(model, data_loader):
    model.eval()
    correct = 0
    for data, labels in data_loader:
        out = model(data)
        # print(f"Output shape: {out.shape}, Labels shape: {labels.shape}")  # Debug print
        if out.shape[0] != labels.shape[0]:  # Check to ensure shapes match
            raise ValueError(f"Output batch size {out.shape[0]} does not match target batch size {labels.shape[0]}")
        pred = out.argmax(dim=1)
        correct += (pred == labels).sum().item()
    accuracy = correct / len(data_loader.dataset)
    return accuracy

# Evaluate the model
accuracy = evaluate(model, data_loader)
print('Accuracy:', accuracy)

Accuracy: 0.5625666666666667
