In [1]:
!pip install rdkit
!pip install duckdb
!pip install pandas networkx
!pip install torch
!pip install torch-geometric

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import duckdb
from torch.utils.data import Dataset

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(current_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━

In [2]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.parquet'

# Draw a class-balanced subset of the training data: up to 30,000 randomly
# ordered rows with binds = 0 plus up to 30,000 with binds = 1.
con = duckdb.connect()
try:
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()
finally:
    # Always release the connection, even when the query raises
    # (e.g. a missing parquet file), so re-running the cell does not leak it.
    con.close()

# Only the full molecule SMILES is used downstream; drop the building blocks.
df = df.drop(columns=['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
# Display 20 randomly chosen rows of the sampled frame as a quick sanity check.
df.sample(n=20)
# Disabled debugging helpers.
# NOTE(review): `smiles_df` is not defined in the visible cells — presumably `df` was meant.
# print(df)
# rows, col= smiles_df.shape
# print(f"Number of rows: {rows}, Number of columns: {col}")

Unnamed: 0,id,molecule_smiles,protein_name,binds
35906,103968082,O=C(N[Dy])[C@H](Cc1cccnc1)Nc1nc(NCC=Cc2cccnc2)...,HSA,1
47123,53251707,Cc1cc(Cl)cc(C(=O)N[Dy])c1Nc1nc(NCCS(=O)(=O)Nc2...,BRD4,1
48829,213799542,COc1ccc(CNc2nc(Nc3nnn[nH]3)nc(Nc3cccc(I)c3C(=O...,BRD4,1
34609,218981577,CCOC(=O)c1cccnc1Nc1nc(Nc2ccnc(C(=O)N[Dy])c2)nc...,BRD4,1
25986,122819258,O=C(N[Dy])[C@@H](Cc1ccc(Cl)cc1)Nc1nc(NCc2cc(=O...,sEH,0
59787,55043070,Cc1ccc(C(=O)N[Dy])cc1Nc1nc(NCc2ccc(CN3CCCC3=O)...,BRD4,1
34567,90950766,Cn1cc(Nc2nc(NCc3cc(Br)no3)nc(N[C@@H](Cc3c(F)c(...,BRD4,1
9617,130893226,CC(CNc1nc(NC[C@@H]2OCCN(C)[C@H]2c2cnn(C)c2)nc(...,HSA,0
6766,95644808,CCOC(=O)c1c[nH]nc1Nc1nc(Nc2cc(F)c(F)cc2Br)nc(N...,sEH,0
41687,82754597,CCOC(=O)c1ncccc1Nc1nc(NCC2CCC(C(=O)N[Dy])CC2)n...,sEH,1


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
import torch
from torch_geometric.data import Data

def smiles_to_graph(smiles, protein_encoding):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Args:
        smiles: SMILES string describing the molecule.
        protein_encoding: Numeric vector (e.g. one row of a one-hot encoding)
            identifying the target protein; stored on the graph as ``protein``.

    Returns:
        A ``Data`` object with
        - ``x``: float tensor of shape ``(num_atoms, 1)`` holding atomic numbers,
        - ``edge_index``: long tensor of shape ``(2, 2 * num_bonds)`` with every
          bond listed in both directions,
        - ``protein``: float tensor built from ``protein_encoding``,
        or ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Node features: one scalar per atom (its atomic number).
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    atom_features = torch.tensor(atomic_numbers, dtype=torch.float).view(-1, 1)

    # Edge indices: each bond contributes both (i, j) and (j, i) so the graph
    # is treated as undirected.
    edge_pairs = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_pairs.append((i, j))
        edge_pairs.append((j, i))

    if edge_pairs:
        edge_indices = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds (e.g. a single atom) must still expose a
        # (2, 0) edge_index; transposing an empty 1-D tensor would give (0,).
        edge_indices = torch.empty((2, 0), dtype=torch.long)

    # Protein encoding
    protein_features = torch.tensor(protein_encoding, dtype=torch.float)

    return Data(x=atom_features, edge_index=edge_indices, protein=protein_features)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Learn a dense one-hot encoding of the protein names, then attach each row's
# one-hot vector to the frame so it travels with its molecule.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(df[['protein_name']])
protein_encoded = encoder.transform(df[['protein_name']])
df['protein_encoded'] = [one_hot_row for one_hot_row in protein_encoded]

In [6]:
# Build one labelled graph per row; rows whose SMILES cannot be parsed
# (smiles_to_graph returns None) are skipped.
graph_data_list = []
# Iterating the three needed columns directly avoids the per-row Series
# construction that DataFrame.iterrows() performs.
for smiles, protein_encoding, binds in zip(
    df['molecule_smiles'], df['protein_encoded'], df['binds']
):
    graph = smiles_to_graph(smiles, protein_encoding)
    if graph is not None:
        # Binary target stored as a 1-element float tensor.
        graph.y = torch.tensor([binds], dtype=torch.float)
        graph_data_list.append(graph)

In [7]:
def normalize_features(features):
    """Standardise each feature column to zero mean and unit variance.

    Args:
        features: 2-D tensor whose rows are samples (here: atoms) and whose
            columns are features.

    Returns:
        Tensor of the same shape computed as ``(features - mean) / (std + 1e-6)``
        with per-column ``mean`` and sample ``std``. The epsilon keeps constant
        columns from dividing by zero.
    """
    mean = features.mean(dim=0, keepdim=True)
    if features.size(0) > 1:
        std = features.std(dim=0, keepdim=True)
    else:
        # The sample standard deviation of a single row is NaN; treat the
        # spread as zero so a one-atom molecule normalises to 0 instead of NaN.
        std = torch.zeros_like(mean)
    return (features - mean) / (std + 1e-6)

# Replace every graph's node-feature matrix with its standardised version.
for graph in graph_data_list:
    standardised_x = normalize_features(graph.x)
    graph.x = standardised_x

In [8]:
from torch.utils.data import Dataset
from torch_geometric.loader import DataLoader

class MoleculeDataset(Dataset):
    """Map-style dataset that serves items from an in-memory list."""

    def __init__(self, data_list):
        # Keep a reference to the caller's list; nothing is copied.
        self.data_list = data_list

    def __len__(self):
        """Return how many items the dataset holds."""
        item_count = len(self.data_list)
        return item_count

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        item = self.data_list[idx]
        return item

# Wrap the prepared graphs and serve them in shuffled mini-batches of 32.
dataset = MoleculeDataset(data_list=graph_data_list)
loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [9]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GCNWithProtein(torch.nn.Module):
    """Graph-level binary classifier combining a GCN with a protein vector.

    Three ``GCNConv`` layers embed the atoms, the embeddings are mean-pooled
    per graph, concatenated with the graph's protein vector, and passed
    through a two-layer MLP that emits one raw logit per graph.

    Args:
        protein_dim: Length of the per-graph protein vector.
        node_dim: Number of input features per node (default 1).
        hidden_dim: Width of every graph-convolution layer (default 128).
        fc_dim: Width of the hidden fully connected layer (default 256).
        dropout: Dropout probability applied before the output layer
            (default 0.5).
    """

    def __init__(self, protein_dim, node_dim=1, hidden_dim=128, fc_dim=256, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(node_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.fc1 = torch.nn.Linear(hidden_dim + protein_dim, fc_dim)
        self.fc2 = torch.nn.Linear(fc_dim, 1)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, data):
        """Return raw logits of shape ``(num_graphs, 1)`` for a batch.

        ``data`` must expose ``x``, ``edge_index``, ``batch`` and ``protein``.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        protein = data.protein

        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))
        # One pooled embedding per graph.
        x = global_mean_pool(x, batch)

        # Batching concatenates the per-graph protein vectors end to end;
        # restore one row per graph. The pooled tensor already has one row
        # per graph, so its size is used instead of `batch.max().item() + 1`,
        # which forces a host sync and fails when `batch` is None.
        protein = protein.view(x.size(0), -1)

        x = torch.cat([x, protein], dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# The protein vector is as wide as the number of categories the one-hot
# encoder learned for its (single) input column.
protein_categories = encoder.categories_[0]
protein_dim = len(protein_categories)
model = GCNWithProtein(protein_dim=protein_dim)

In [None]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    """Run one optimisation pass over ``loader``.

    Returns:
        The mean training loss per graph for this epoch (0.0 if the loader
        yielded no batches).
    """
    model.train()
    total_loss = 0.0
    total_graphs = 0
    for data in loader:
        optimizer.zero_grad()
        output = model(data)
        targets = data.y.view(-1, 1)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # skew the epoch average.
        batch_graphs = targets.size(0)
        total_loss += loss.item() * batch_graphs
        total_graphs += batch_graphs
    return total_loss / max(total_graphs, 1)

for epoch in range(200):
    epoch_loss = train()
    # Report progress periodically so a long run can be monitored.
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1:03d}: mean training loss = {epoch_loss:.4f}')

In [None]:
# Predict on new data
new_smiles = [
    'C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]',
    'C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C)cc2)n1)C(=O)N[Dy]',
    'C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2ncn3CC(C)O)n1)C(=O)N[Dy]',
]
new_proteins = ['BRD4', 'sEH', 'BRD4']
new_protein_df = pd.DataFrame(new_proteins, columns=['protein_name'])
new_protein_encoded = encoder.transform(new_protein_df)
new_graph_data_list = []
# (smiles, protein) pairs that actually produced a graph, kept in step with
# new_graph_data_list so predictions are never paired with the wrong input
# when a molecule is skipped.
scored_inputs = []

for smiles, protein, protein_encoding in zip(new_smiles, new_proteins, new_protein_encoded):
    try:
        graph = smiles_to_graph(smiles, protein_encoding)
    except Exception as e:
        print(f"Error processing SMILES '{smiles}': {e}")
        continue
    if graph is None:
        print(f"Skipping SMILES '{smiles}': could not be parsed")
        continue
    # Apply the same per-graph feature standardisation used for the training
    # graphs; otherwise the model sees raw atomic numbers it was never trained on.
    graph.x = normalize_features(graph.x)
    new_graph_data_list.append(graph)
    scored_inputs.append((smiles, protein))

new_loader = DataLoader(new_graph_data_list, batch_size=1, shuffle=False)

# Switch to inference mode so dropout is disabled and predictions are
# deterministic.
model.eval()
predictions = []
with torch.no_grad():
    for data in new_loader:
        logits = model(data)
        # batch_size=1, so the logits tensor holds exactly one value.
        prediction = torch.sigmoid(logits).item()
        predictions.append(prediction)

# Print the predictions
for (smiles, protein), pred in zip(scored_inputs, predictions):
    print(f'SMILES: {smiles}, Protein: {protein}, Predicted Binding Affinity: {pred}')