In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import duckdb
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

In [17]:
train_path = './train.parquet'
test_path = './test.parquet'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 30000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 30000)""").df()

con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [18]:
df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,134382410,O=C(Nc1c(Br)cc(F)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,Cc1cc(O)cc(C)c1N,Cc1sc(CN)nc1C(C)C,Cc1cc(O)cc(C)c1Nc1nc(NCc2nc(C(C)C)c(C)s2)nc(Nc...,sEH,0
1,23794434,CCS(=O)(=O)c1cc(C(=O)O)c(OC)cc1NC(=O)OCC1c2ccc...,Nc1ccc(F)nc1,CC(C)(C)OC(=O)N1CCN(c2ccccc2N)CC1,CCS(=O)(=O)c1cc(C(=O)N[Dy])c(OC)cc1Nc1nc(Nc2cc...,BRD4,0
2,100275550,O=C(N[C@@H](Cc1ccc(I)cc1)C(=O)O)OCC1c2ccccc2-c...,CC(C)c1nnc([C@H]2C[C@H](CN)[C@H](O)C2)[nH]1,C=C(Cl)CN.Cl,C=C(Cl)CNc1nc(NC[C@H]2C[C@H](c3nnc(C(C)C)[nH]3...,HSA,0
3,42258805,CS(=O)(=O)c1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3...,Nc1cc(Cl)cnc1Cl,Nc1ccc2ncoc2c1,CS(=O)(=O)c1ccc(C(=O)N[Dy])c(Nc2nc(Nc3ccc4ncoc...,HSA,0
4,189604502,O=C(Nc1ccc(C(=O)O)cc1Cl)OCC1c2ccccc2-c2ccccc21,CC(C)(C)OC(=O)n1ncc2cc(N)ccc21,Nc1ccc(Cl)c(F)c1,CC(C)(C)OC(=O)n1ncc2cc(Nc3nc(Nc4ccc(Cl)c(F)c4)...,sEH,0


# preprocess

In [20]:
from rdkit import Chem
import torch
from torch_geometric.data import Data

def molecule_to_graph(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    atoms = molecule.GetAtoms()
    bonds = molecule.GetBonds()

    node_features = torch.tensor([atom.GetAtomicNum() for atom in atoms], dtype=torch.float).unsqueeze(1)
    
    edge_index = []
    edge_features = []
    for bond in bonds:
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_type = bond.GetBondTypeAsDouble()
        # Append forward and backward directions
        edge_index.extend([(start, end), (end, start)])
        edge_features.extend([bond_type, bond_type])  # Same feature for both directions

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)  # Ensure it has the right shape

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_features)




In [None]:
# Example usage
smiles = 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
graph_data = molecule_to_graph(smiles)
if graph_data:
    print(graph_data)
else:
    print("Invalid SMILES string.")

In [30]:
def molecule_to_graph(smiles, protein_name, protein_to_idx, target_value):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    atoms = molecule.GetAtoms()
    bonds = molecule.GetBonds()

    # Node features: Atomic number
    node_features = [atom.GetAtomicNum() for atom in atoms]
    node_features = torch.tensor(node_features, dtype=torch.float).unsqueeze(1)
    
    # Edge indices and features
    edge_index = []
    edge_features = []
    for bond in bonds:
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index.extend([(start, end), (end, start)])
        edge_features.extend([bond.GetBondTypeAsDouble(), bond.GetBondTypeAsDouble()])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)

    protein_idx = protein_to_idx[protein_name]

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_features, protein_idx=protein_idx, y=torch.tensor([target_value], dtype=torch.float))


In [31]:
# Assuming 'df' has a column 'molecule_smiles' containing SMILES strings
protein_to_idx = {name: idx for idx, name in enumerate(df['protein_name'].unique())}
# Assuming df has a 'binds' column with target values
data_list = [
    molecule_to_graph(row['molecule_smiles'], row['protein_name'], protein_to_idx, row['binds'])
    for index, row in df.iterrows()
]



In [None]:

# Example DataFrame setup
df = pd.DataFrame({
    'molecule_smiles': ['CCO', 'CCC', 'CCN'],  # example SMILES
    'protein_name': ['sEH', 'BRD4', 'HSA']
})

# Create a mapping from protein names to indices
protein_to_idx = {name: idx for idx, name in enumerate(df['protein_name'].unique())}

# Convert DataFrame rows to graph data
data_list = [molecule_to_graph(row['molecule_smiles'], row['protein_name'], protein_to_idx) for index, row in df.iterrows()]

In [32]:
data_list[:5]

[Data(x=[40, 1], edge_index=[2, 86], edge_attr=[86, 1], y=[1], protein_idx=0),
 Data(x=[52, 1], edge_index=[2, 112], edge_attr=[112, 1], y=[1], protein_idx=1),
 Data(x=[41, 1], edge_index=[2, 88], edge_attr=[88, 1], y=[1], protein_idx=2),
 Data(x=[40, 1], edge_index=[2, 88], edge_attr=[88, 1], y=[1], protein_idx=2),
 Data(x=[44, 1], edge_index=[2, 96], edge_attr=[96, 1], y=[1], protein_idx=0)]

In [33]:
import torch
import pandas as pd
from rdkit import Chem
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool

# Assuming molecule_to_graph is correctly implemented as previously discussed

class GNN(torch.nn.Module):
    def __init__(self, num_features, num_protein_types, embedding_dim=10):
        super(GNN, self).__init__()
        self.conv1 = GATConv(num_features, 16, heads=8)
        self.conv2 = GATConv(16 * 8, 32)
        self.protein_embedding = torch.nn.Embedding(num_protein_types, embedding_dim)
        self.fc1 = torch.nn.Linear(32 + embedding_dim, 64)
        self.fc2 = torch.nn.Linear(64, 1)  # Output layer for regression

    def forward(self, data):
        x, edge_index, batch, protein_idx = data.x, data.edge_index, data.batch, data.protein_idx
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)  # Graph-level features for each molecule in the batch
        protein_embed = self.protein_embedding(protein_idx)
        x = torch.cat([x, protein_embed], dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# DataFrame setup, assume df and protein_to_idx are defined as you've done

# Convert DataFrame rows to graph data and include target values if available
#data_list = [molecule_to_graph(row['molecule_smiles'], row['protein_name'], protein_to_idx) for index, row in df.iterrows()]
loader = DataLoader(data_list, batch_size=32, shuffle=True)

# Instantiate the model
model = GNN(num_features=1, num_protein_types=3, embedding_dim=10)  
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# Training Loop
epochs = 50  # Define the number of epochs
for epoch in range(epochs):
    model.train()
    for batch in loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)  # Ensure batch.y is correctly set as target values
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 0.24480946362018585
Epoch 2, Loss: 0.25336194038391113
Epoch 3, Loss: 0.2429126501083374
Epoch 4, Loss: 0.2547247111797333
Epoch 5, Loss: 0.24511493742465973
Epoch 6, Loss: 0.24903278052806854
Epoch 7, Loss: 0.2483302652835846
Epoch 8, Loss: 0.24753382802009583
Epoch 9, Loss: 0.2521287202835083
Epoch 10, Loss: 0.2491285651922226
Epoch 11, Loss: 0.2520098388195038
Epoch 12, Loss: 0.24957674741744995
Epoch 13, Loss: 0.24408774077892303
Epoch 14, Loss: 0.2460491806268692
Epoch 15, Loss: 0.25408220291137695
Epoch 16, Loss: 0.24818657338619232
Epoch 17, Loss: 0.2490234524011612
Epoch 18, Loss: 0.24931392073631287
Epoch 19, Loss: 0.2509426176548004
Epoch 20, Loss: 0.2503223121166229
Epoch 21, Loss: 0.2528747022151947
Epoch 22, Loss: 0.24766354262828827
Epoch 23, Loss: 0.24757221341133118
Epoch 24, Loss: 0.2533261775970459
Epoch 25, Loss: 0.24662800133228302
Epoch 26, Loss: 0.2649485468864441
Epoch 27, Loss: 0.24842503666877747
Epoch 28, Loss: 0.2521907687187195
Epoch 29, Loss:

In [6]:
def molecule_to_graph(molecule_smiles, protein_feature):
    molecule = Chem.MolFromSmiles(molecule_smiles)
    atoms = molecule.GetAtoms()
    bonds = molecule.GetBonds()

    node_features = torch.tensor([atom.GetAtomicNum() for atom in atoms], dtype=torch.float).unsqueeze(1)
    edge_index = torch.tensor([(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in bonds], dtype=torch.long).t().contiguous()

    return Data(x=node_features, edge_index=edge_index, protein_features=protein_feature)
molecule_to_graph()

# Test

In [35]:
def test_molecule_to_graph(smiles, protein_name, protein_to_idx):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    atoms = molecule.GetAtoms()
    bonds = molecule.GetBonds()

    # Node features: Atomic number
    node_features = [atom.GetAtomicNum() for atom in atoms]
    node_features = torch.tensor(node_features, dtype=torch.float).unsqueeze(1)
    
    # Edge indices and features
    edge_index = []
    edge_features = []
    for bond in bonds:
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index.extend([(start, end), (end, start)])
        edge_features.extend([bond.GetBondTypeAsDouble(), bond.GetBondTypeAsDouble()])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)

    protein_idx = protein_to_idx[protein_name]

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_features, protein_idx=protein_idx)


In [41]:

import os

train_path = './train.parquet'
test_path = './test.parquet'
# Process the test.parquet file chunk by chunk
test_file = './test.csv'
output_file = './submission.csv'  

# Read the test.parquet file into a pandas DataFrame
df_test = pd.read_csv(test_file)

# Create a mapping from protein names to indices
test_protein_to_idx = {name: idx for idx, name in enumerate(df_test['protein_name'].unique())}



  





In [43]:
# Convert DataFrame rows to graph data
test_data_list = [test_molecule_to_graph(row['molecule_smiles'], row['protein_name'], test_protein_to_idx) for index, row in df_test.iterrows()]

KeyboardInterrupt: 

In [45]:
import os
import pandas as pd
from torch_geometric.data import DataLoader
import torch

# Assuming your model is defined somewhere as `model`
train_path = './train.parquet'
test_path = './test.parquet'
output_file = './submission.csv'

# Read the test.parquet file into a pandas DataFrame
# If you really meant to use a CSV, ignore this conversion
if os.path.exists('./test.parquet'):
    df_test = pd.read_parquet(test_path)
else:
    df_test = pd.read_csv('./test.csv')  # Backup if parquet is not available

# Create a mapping from protein names to indices
protein_to_idx = {name: idx for idx, name in enumerate(df_test['protein_name'].unique())}

# Assuming test_molecule_to_graph is similar to your earlier defined function
def test_molecule_to_graph(smiles, protein_name, protein_to_idx):
    # Dummy implementation here; replace with your actual function
    return Data(x=torch.randn(50, 1), edge_index=torch.randint(0, 50, (2, 100)), protein_idx=protein_to_idx[protein_name])

# Convert DataFrame rows to graph data
test_data_list = [test_molecule_to_graph(row['molecule_smiles'], row['protein_name'], protein_to_idx) for index, row in df_test.iterrows()]

# Load data into DataLoader
test_loader = DataLoader(test_data_list, batch_size=32, shuffle=False)

# Set model to evaluation mode and predict
model.eval()
predictions = []

with torch.no_grad():  # Disable gradient computation
    for data in test_loader:
        output = model(data)
        # Assuming output needs to be sigmoid-transformed to represent probabilities
        predicted_probabilities = torch.sigmoid(output)
        predictions.extend(predicted_probabilities.detach().cpu().numpy())

# Prepare and save output DataFrame
output_df = pd.DataFrame({
    'id': df_test['id'],
    'binds': [prob[0] for prob in predictions]  # Flatten probabilities if necessary
})
output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))




In [39]:


from torch_geometric.data import DataLoader
import torch
test_loader = DataLoader(test_data_list, batch_size=32, shuffle=False)


model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():  # Disable gradient computation
    for data in test_loader:
        output = model(data)  # Make predictions
        #predicted_probabilities = torch.sigmoid(output)  # Apply sigmoid to get probabilities if your model outputs logits
        predictions.extend(output)  # 

import pandas as pd

output_df = pd.DataFrame({
    'id': df_test['id'],  # Ensure df_test has an 'id' column
    'binds': predictions
})

# Specify the path to your output file
output_file = 'submission.csv'

# Save to CSV, appending if file exists, otherwise write new file with header
import os
output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))



ValueError: array length 60000 does not match index length 1674896

In [4]:
from pysmiles import read_smiles
import networkx as nx
    
smiles = 'C12=C3C4=C5C6=C1C7=C8C9=C1C%10=C%11C(=C29)C3=C2C3=C4C4=C5C5=C9C6=C7C6=C7C8=C1C1=C8C%10=C%10C%11=C2C2=C3C3=C4C4=C5C5=C%11C%12=C(C6=C95)C7=C1C1=C%12C5=C%11C4=C3C3=C5C(=C81)C%10=C23'
mol = read_smiles(smiles)
    
# atom vector (C only)
print(mol.nodes(data='element'))
# adjacency matrix
print(nx.to_numpy_matrix(mol))

[(0, 'C'), (1, 'C'), (2, 'C'), (3, 'C'), (4, 'C'), (5, 'C'), (6, 'C'), (7, 'C'), (8, 'C'), (9, 'C'), (10, 'C'), (11, 'C'), (12, 'C'), (13, 'C'), (14, 'C'), (15, 'C'), (16, 'C'), (17, 'C'), (18, 'C'), (19, 'C'), (20, 'C'), (21, 'C'), (22, 'C'), (23, 'C'), (24, 'C'), (25, 'C'), (26, 'C'), (27, 'C'), (28, 'C'), (29, 'C'), (30, 'C'), (31, 'C'), (32, 'C'), (33, 'C'), (34, 'C'), (35, 'C'), (36, 'C'), (37, 'C'), (38, 'C'), (39, 'C'), (40, 'C'), (41, 'C'), (42, 'C'), (43, 'C'), (44, 'C'), (45, 'C'), (46, 'C'), (47, 'C'), (48, 'C'), (49, 'C'), (50, 'C'), (51, 'C'), (52, 'C'), (53, 'C'), (54, 'C'), (55, 'C'), (56, 'C'), (57, 'C'), (58, 'C'), (59, 'C')]
[[0. 1. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]]


In [33]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader

# Model definition
class MoleculeProteinGNN(torch.nn.Module):
    def __init__(self, num_node_features, num_protein_features, num_classes):
        super(MoleculeProteinGNN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc1 = torch.nn.Linear(64 + num_protein_features, 128)
        self.fc2 = torch.nn.Linear(128, num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        protein_features = data.protein_features
        if protein_features.dim() == 1:
            protein_features = protein_features.unsqueeze(0)  # Ensure it is 2D
        x = torch.cat((x, protein_features), dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Example Data preparation
# Assume data_list is correctly prepared and contains Data objects with .x, .edge_index, .batch, and .protein_features
train_loader = DataLoader(data_list[:int(0.8 * len(data_list))], batch_size=32, shuffle=True)
test_loader = DataLoader(data_list[int(0.8 * len(data_list)):], batch_size=32, shuffle=False)

# Model instantiation
model = MoleculeProteinGNN(num_node_features=1, num_protein_features=3, num_classes=1)  # Adjust as per actual feature counts

# Loss and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
model.train()
for data in train_loader:
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, data.y.float().unsqueeze(1))  # Match output dimensions
    loss.backward()
    optimizer.step()

# Evaluation loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        output = model(data)
        predictions = (torch.sigmoid(output) > 0.5).float()  # Convert logits to binary predictions
        correct += (predictions == data.y.unsqueeze(1)).sum().item()  # Ensure dimensions match
        total += data.y.size(0)

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [27]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 32)
        self.fc = torch.nn.Linear(32, 1)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # Average pooling
        x = self.fc(x)
        return torch.sigmoid(x)

In [28]:
data_list = [Data(...)]  # Create a list of Data objects from your dataset
loader = DataLoader(data_list, batch_size=32, shuffle=True)

# Model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Training loop
model.train()
for epoch in range(200):
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch}, Loss: {loss.item()}')

NameError: name 'num_node_features' is not defined

In [None]:
import os

# Process the test.parquet file chunk by chunk
test_file = './test.csv'
output_file = 'submission.csv'  # Specify the path and filename for the output file

# Read the test.parquet file into a pandas DataFrame
for df_test in pd.read_csv(test_file, chunksize=100000):

    # Generate ECFPs for the molecule_smiles
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(smiles_to_ecfp)

    # One-hot encode the protein_name
    protein_onehot =  OneHotEncoder(sparse_output=False).transform(df_test['protein_name'].values.reshape(-1, 1))

    # Combine ECFPs and one-hot encoded protein_name
    X_test = [ecfp + protein for ecfp, protein in zip(df_test['ecfp'].tolist(), protein_onehot.tolist())]

    # Predict the probabilities
    probabilities = model.predict_proba(X_test)[:, 1]

    # Create a DataFrame with 'id' and 'probability' columns
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})

    # Save the output DataFrame to a CSV file
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))