In [1]:
# Install necessary packages
!pip install rdkit
!pip install duckdb
!pip install pandas networkx
!pip install torch
!pip install torch-geometric

# Import libraries
import numpy as np 
import pandas as pd 
import duckdb
from torch.utils.data import Dataset
from rdkit import Chem
from rdkit.Chem import AllChem
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.csv'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()

con.close()

df = df.drop(['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'], axis=1)

test_df = pd.read_csv(test_path)
test_df = test_df.drop(['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'], axis=1)
print(test_df.head())

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

          id                                    molecule_smiles protein_name
0  295246830  C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...         BRD4
1  295246831  C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...          HSA
2  295246832  C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...          sEH
3  295246833  C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...         BRD4
4  295246834  C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...          HSA


In [3]:
# pd.set_option('display.max_colwidth', None)
# df.sample(n=10)

In [4]:
# Encode protein names to numerical values
protein_encoder = LabelEncoder()
protein_encoder.fit(['HSA', 'BRD4', 'sEH'])

# Function to convert SMILES and protein to graph
def smiles_to_graph(smiles, protein):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Convert molecule to graph data structure
    nodes = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    edges = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in mol.GetBonds()]
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    x = torch.tensor(nodes, dtype=torch.float).view(-1, 1)

    # Encode protein as a feature
    protein_encoded = protein_encoder.transform([protein])[0]
    protein_feature = torch.tensor([protein_encoded], dtype=torch.float)

    # Add protein feature to each node
    protein_features = protein_feature.repeat(x.size(0), 1)
    x = torch.cat([x, protein_features], dim=1)

    return Data(x=x, edge_index=edge_index)

# Apply the function to the training dataframe
df['graph'] = df.apply(lambda row: smiles_to_graph(row['molecule_smiles'], row['protein_name']), axis=1)

# Filter out invalid graphs in the training dataframe
df = df[df['graph'].notnull()]

# Apply the function to the test dataframe
test_df['graph'] = test_df.apply(lambda row: smiles_to_graph(row['molecule_smiles'], row['protein_name']), axis=1)

# Filter out invalid graphs in the test dataframe
test_df = test_df[test_df['graph'].notnull()]


In [5]:
# Custom Dataset Class
class MoleculeDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        data = self.dataframe.iloc[idx]
        graph = data['graph']
        label = torch.tensor(data['binds'], dtype=torch.long)
        return graph, label
    
class TestMoleculeDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        data = self.dataframe.iloc[idx]
        graph = data['graph']
        return graph

# Create dataset and data loader
dataset = MoleculeDataset(df)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Create dataset and data loader for test data
test_dataset = TestMoleculeDataset(test_df)
test_data_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [6]:
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool

class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=2):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=heads)
        self.output_dim = output_dim  # Add output_dim as an attribute

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)  # Pooling to get graph-level representation
        return F.softmax(x, dim=1)  # Return softmax probabilities

# Example model instantiation
input_dim = 2  # 1 for atomic number + 1 for encoded protein feature
hidden_dim = 8
output_dim = 2  # Binary classification: bind (1) or not bind (0)
model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [7]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 20
model.train()
for epoch in range(epochs):
    total_loss = 0
    for data, labels in data_loader:
        optimizer.zero_grad()
        out = model(data)
        # print(f"Output shape: {out.shape}, Labels shape: {labels.shape}")  # Debug print
        if out.shape[0] != labels.shape[0]:  # Check to ensure shapes match
            raise ValueError(f"Output batch size {out.shape[0]} does not match target batch size {labels.shape[0]}")
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}')

Epoch 1, Loss: 1.1624087738672892
Epoch 2, Loss: 1.155278500366211
Epoch 3, Loss: 1.15523627243042
Epoch 4, Loss: 1.155136761156718
Epoch 5, Loss: 1.1547163132985432
Epoch 6, Loss: 1.1547598248799642
Epoch 7, Loss: 1.1546796047846477
Epoch 8, Loss: 1.15456640364329
Epoch 9, Loss: 1.154810484377543
Epoch 10, Loss: 1.1542330872217814
Epoch 11, Loss: 1.1541615193049113
Epoch 12, Loss: 1.1541994544347127
Epoch 13, Loss: 1.1543464658101399
Epoch 14, Loss: 1.1539769917170206
Epoch 15, Loss: 1.1545216886520386
Epoch 16, Loss: 1.1543891159057618
Epoch 17, Loss: 1.1541189469337463
Epoch 18, Loss: 1.1540819494883219
Epoch 19, Loss: 1.1538853530883788
Epoch 20, Loss: 1.1535997004826863


In [8]:
def evaluate(model, data_loader):
    model.eval()
    correct = 0
    for data, labels in data_loader:
        out = model(data)
        # print(f"Output shape: {out.shape}, Labels shape: {labels.shape}")  # Debug print
        if out.shape[0] != labels.shape[0]:  # Check to ensure shapes match
            raise ValueError(f"Output batch size {out.shape[0]} does not match target batch size {labels.shape[0]}")
        pred = out.argmax(dim=1)
        correct += (pred == labels).sum().item()
    accuracy = correct / len(data_loader.dataset)
    return accuracy

# Evaluate the model
accuracy = evaluate(model, data_loader)
print('Accuracy:', accuracy)

Accuracy: 0.5622666666666667


In [9]:
def make_predictions(model, test_data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for data in test_data_loader:
            out = model(data)
            probs = out[:, 1]  # Use the probability of the positive class (binding)
            predictions.extend(probs.cpu().numpy())
    return predictions

# Use the prediction function to get predictions for the test data
test_predictions = make_predictions(model, test_data_loader)

# Add the predictions to the test dataframe
test_df['binds'] = test_predictions

# Select only the 'id' and 'binds' columns
output_df = test_df[['id', 'binds']]

# Save the dataframe to a CSV file
output_csv_path = '/kaggle/working/test_predictions.csv'
output_df.to_csv(output_csv_path, index=False)

# Display the saved file path
print(f'Saved predictions to {output_csv_path}')

Saved predictions to /kaggle/working/test_predictions.csv
