In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import duckdb
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
train_path = './train.parquet'
test_path = './test.parquet'

# Number of rows requested for each value of `binds` (0 and 1), so both
# classes are asked for the same sample size.
N_SAMPLES_PER_CLASS = 30000

con = duckdb.connect()
try:
    # Two independently shuffled sub-queries (one per `binds` value) are
    # concatenated with UNION ALL and materialised as a pandas DataFrame.
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {N_SAMPLES_PER_CLASS})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {N_SAMPLES_PER_CLASS})""").df()
finally:
    # Release the connection even when the query raises (e.g. the parquet
    # file is missing); previously it stayed open on failure.
    con.close()

In [None]:
# Preview the first five sampled rows to sanity-check the loaded columns.
df.head(n=5)

# Preprocess: convert SMILES strings into molecular graphs

In [None]:
from rdkit import Chem
import torch
from torch_geometric.data import Data

def molecule_to_graph(smiles):
    """Convert a SMILES string into a torch_geometric ``Data`` graph.

    Each atom becomes a node whose single feature is its atomic number. Each
    bond contributes two directed edges (one per direction), both carrying the
    value returned by ``GetBondTypeAsDouble``.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A ``Data`` object with ``x`` of shape (num_atoms, 1), ``edge_index`` of
        shape (2, 2 * num_bonds) and ``edge_attr`` of shape (2 * num_bonds, 1),
        or ``None`` when RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    node_features = torch.tensor(
        [atom.GetAtomicNum() for atom in molecule.GetAtoms()],
        dtype=torch.float,
    ).unsqueeze(1)

    edge_pairs = []
    edge_features = []
    for bond in molecule.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_type = bond.GetBondTypeAsDouble()
        # Store both directions so the graph is effectively undirected.
        edge_pairs.extend([(start, end), (end, start)])
        edge_features.extend([bond_type, bond_type])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds (single atom or ion) would otherwise yield a
        # 1-D empty tensor; keep the (2, num_edges) layout with zero edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_features)




In [None]:
# Example usage: convert one SMILES string and print the resulting graph.
smiles = 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
graph_data = molecule_to_graph(smiles)
# molecule_to_graph signals a parse failure with None, so test for None
# explicitly rather than relying on the truthiness of the graph object.
if graph_data is not None:
    print(graph_data)
else:
    print("Invalid SMILES string.")

In [None]:
def molecule_to_graph(smiles, protein_name, protein_to_idx, target_value=None):
    """Convert a (molecule, protein) pair into a torch_geometric ``Data`` graph.

    Each atom becomes a node whose single feature is its atomic number. Each
    bond contributes two directed edges, both carrying the value returned by
    ``GetBondTypeAsDouble``.

    Args:
        smiles: SMILES string describing the molecule.
        protein_name: Key looked up in ``protein_to_idx``.
        protein_to_idx: Mapping from protein name to integer index.
        target_value: Optional label stored as ``y`` (float tensor of shape
            (1,)). When ``None`` the graph is built without a ``y`` attribute,
            which allows unlabeled rows to be converted.

    Returns:
        A ``Data`` object with ``x``, ``edge_index``, ``edge_attr``,
        ``protein_idx`` and (if a target was given) ``y``; ``None`` when RDKit
        cannot parse ``smiles``.

    Raises:
        KeyError: If ``protein_name`` is not a key of ``protein_to_idx``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # Node features: atomic number, one column per node.
    node_features = torch.tensor(
        [atom.GetAtomicNum() for atom in molecule.GetAtoms()],
        dtype=torch.float,
    ).unsqueeze(1)

    # Edge indices and features, stored once per direction.
    edge_pairs = []
    edge_features = []
    for bond in molecule.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_type = bond.GetBondTypeAsDouble()
        edge_pairs.extend([(start, end), (end, start)])
        edge_features.extend([bond_type, bond_type])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds would otherwise yield a 1-D empty tensor;
        # keep the (2, num_edges) layout with zero edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)

    fields = dict(
        x=node_features,
        edge_index=edge_index,
        edge_attr=edge_features,
        protein_idx=protein_to_idx[protein_name],
    )
    if target_value is not None:
        fields['y'] = torch.tensor([target_value], dtype=torch.float)

    return Data(**fields)


In [None]:
# Give every distinct protein name in `df` an integer index, in order of
# first appearance.
protein_to_idx = dict(
    (name, idx) for idx, name in enumerate(df['protein_name'].unique())
)

# Build one graph per row from the SMILES string, the protein name and the
# `binds` label; rows whose SMILES cannot be parsed yield None.
data_list = [
    molecule_to_graph(smiles_str, protein, protein_to_idx, label)
    for smiles_str, protein, label in zip(
        df['molecule_smiles'], df['protein_name'], df['binds']
    )
]



In [None]:

# Example DataFrame setup: a three-row toy frame for smoke-testing the graph
# conversion. It lives under its own `example_*` names so that the sampled
# training `df`, `protein_to_idx` and `data_list` are not overwritten.
example_df = pd.DataFrame({
    'molecule_smiles': ['CCO', 'CCC', 'CCN'],  # example SMILES
    'protein_name': ['sEH', 'BRD4', 'HSA'],
    'binds': [0, 1, 0],  # placeholder labels: molecule_to_graph takes a target value
})

# Create a mapping from protein names to indices
example_protein_to_idx = {
    name: idx for idx, name in enumerate(example_df['protein_name'].unique())
}

# Convert DataFrame rows to graph data (the target value was previously
# omitted, which does not match molecule_to_graph's four-argument signature).
example_data_list = [
    molecule_to_graph(row['molecule_smiles'], row['protein_name'], example_protein_to_idx, row['binds'])
    for index, row in example_df.iterrows()
]

In [None]:
# Show the first five converted graphs.
data_list[0:5]

In [None]:
import torch
import pandas as pd
from rdkit import Chem
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool

# Assuming molecule_to_graph is correctly implemented as previously discussed

class GNN(torch.nn.Module):
    """Graph attention network producing one value per (molecule, protein) pair.

    Two ``GATConv`` layers embed the atoms, ``global_mean_pool`` collapses them
    to one vector per graph, and that vector is concatenated with a learned
    protein embedding before a two-layer fully connected head.

    Args:
        num_features: Number of input features per node.
        num_protein_types: Number of rows in the protein embedding table.
        embedding_dim: Width of each protein embedding vector.
    """

    def __init__(self, num_features, num_protein_types, embedding_dim=10):
        super().__init__()
        per_head, n_heads, pooled_dim, hidden_dim = 16, 8, 32, 64
        # The second conv layer consumes per_head * n_heads input channels.
        self.conv1 = GATConv(num_features, per_head, heads=n_heads)
        self.conv2 = GATConv(per_head * n_heads, pooled_dim)
        self.protein_embedding = torch.nn.Embedding(num_protein_types, embedding_dim)
        self.fc1 = torch.nn.Linear(pooled_dim + embedding_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)  # single output per graph

    def forward(self, data):
        """Return a (num_graphs, 1) tensor of raw (un-activated) outputs."""
        atom_repr = self.conv1(data.x, data.edge_index)
        atom_repr = F.relu(atom_repr)
        atom_repr = self.conv2(atom_repr, data.edge_index)
        # One pooled vector per graph in the batch.
        graph_repr = global_mean_pool(atom_repr, data.batch)
        protein_repr = self.protein_embedding(data.protein_idx)
        joint = torch.cat([graph_repr, protein_repr], dim=1)
        hidden = F.relu(self.fc1(joint))
        return self.fc2(hidden)

# Training setup. `data_list` and `protein_to_idx` come from the preprocessing
# cells above.

# molecule_to_graph returns None for SMILES it cannot parse; such entries
# cannot be batched, so drop them before building the loader.
train_graphs = [graph for graph in data_list if graph is not None]
loader = DataLoader(train_graphs, batch_size=32, shuffle=True)

# Instantiate the model. The embedding table is sized from the mapping rather
# than a hard-coded 3 so it always matches the indices stored in the graphs.
model = GNN(num_features=1, num_protein_types=len(protein_to_idx), embedding_dim=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# Training loop
epochs = 30  # Define the number of epochs
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    seen_graphs = 0
    for batch in loader:
        optimizer.zero_grad()
        output = model(batch)
        # The model returns (num_graphs, 1) while batch.y is (num_graphs,).
        # Flatten both so MSELoss compares element-wise instead of
        # broadcasting to a (num_graphs, num_graphs) matrix.
        loss = criterion(output.view(-1), batch.y.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch.num_graphs
        seen_graphs += batch.num_graphs
    # Report the mean loss over the whole epoch, not just the last batch.
    print(f'Epoch {epoch+1}, Loss: {running_loss / max(seen_graphs, 1)}')


# Inference on the test set

In [None]:
def test_molecule_to_graph(smiles, protein_name, protein_to_idx):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    atoms = molecule.GetAtoms()
    bonds = molecule.GetBonds()

    # Node features: Atomic number
    node_features = [atom.GetAtomicNum() for atom in atoms]
    node_features = torch.tensor(node_features, dtype=torch.float).unsqueeze(1)
    
    # Edge indices and features
    edge_index = []
    edge_features = []
    for bond in bonds:
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index.extend([(start, end), (end, start)])
        edge_features.extend([bond.GetBondTypeAsDouble(), bond.GetBondTypeAsDouble()])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_features = torch.tensor(edge_features, dtype=torch.float).unsqueeze(1)

    protein_idx = protein_to_idx[protein_name]

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_features, protein_idx=protein_idx)


In [None]:
import os
import pandas as pd
from torch_geometric.data import DataLoader
import torch

# Assuming your model is defined somewhere as `model`
train_path = './train.parquet'
test_path = './test.parquet'
output_file = './submission.csv'

# Read the test set, preferring the parquet file and falling back to CSV.
# The existence check uses `test_path` so it cannot drift from the path read.
if os.path.exists(test_path):
    df_test = pd.read_parquet(test_path)
else:
    df_test = pd.read_csv('./test.csv')  # Backup if parquet is not available

# Reuse the `protein_to_idx` mapping the model was trained with. Rebuilding it
# from the test frame can assign different indices to the same protein (the
# order of unique() depends on row order), which would feed the wrong
# embedding rows to the model.
unknown_proteins = set(df_test['protein_name'].unique()) - set(protein_to_idx)
if unknown_proteins:
    raise ValueError(
        f"Test set contains proteins absent from the training mapping: {sorted(unknown_proteins)}"
    )

# Convert DataFrame rows to graph data. The list stays aligned with df_test
# rows (unparsable SMILES yield None) because ids are paired with it later.
test_data_list = [
    test_molecule_to_graph(smiles_str, protein, protein_to_idx)
    for smiles_str, protein in zip(df_test['molecule_smiles'], df_test['protein_name'])
]




In [None]:
import os
import pandas as pd
from torch_geometric.data import DataLoader
import torch
# Load data into DataLoader; shuffle stays off so predictions keep the row
# order of df_test.
test_loader = DataLoader(test_data_list, batch_size=32, shuffle=False)

# Set model to evaluation mode and predict
model.eval()
predictions = []

with torch.no_grad():  # Disable gradient computation
    for data in test_loader:
        output = model(data)
        # NOTE(review): the GNN above is trained with MSELoss on its raw
        # output, so this sigmoid rescales the scores rather than decoding
        # logits. Kept as-is; confirm it is intended.
        predicted_probabilities = torch.sigmoid(output)
        # Flatten (num_graphs, 1) to plain floats, one per test row.
        predictions.extend(predicted_probabilities.view(-1).cpu().tolist())

# Prepare and save output DataFrame
output_df = pd.DataFrame({
    'id': df_test['id'],
    'binds': predictions,
})
# Overwrite instead of appending: with mode='a' every re-run of this cell
# added another full copy of the rows to the submission file.
output_df.to_csv(output_file, index=False)

In [None]:


from torch_geometric.data import DataLoader
import torch
# shuffle stays off so predictions keep the row order of df_test.
test_loader = DataLoader(test_data_list, batch_size=32, shuffle=False)


model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():  # Disable gradient computation
    for data in test_loader:
        output = model(data)  # Raw model outputs, shape (num_graphs, 1)
        # Store plain floats. Extending with the tensor itself put one
        # 1-element tensor per row into the DataFrame, which to_csv wrote
        # out as text like "tensor([0.12])".
        predictions.extend(output.view(-1).cpu().tolist())

import pandas as pd

output_df = pd.DataFrame({
    'id': df_test['id'],  # Ensure df_test has an 'id' column
    'binds': predictions
})

# Specify the path to your output file
output_file = 'submission.csv'

# Overwrite the file on every run. Appending (mode='a') duplicated all rows
# each time the cell was executed.
import os
output_df.to_csv(output_file, index=False)

In [None]:
from pysmiles import read_smiles
import networkx as nx
    
# Parse a large all-carbon SMILES string with pysmiles into a networkx graph.
smiles = 'C12=C3C4=C5C6=C1C7=C8C9=C1C%10=C%11C(=C29)C3=C2C3=C4C4=C5C5=C9C6=C7C6=C7C8=C1C1=C8C%10=C%10C%11=C2C2=C3C3=C4C4=C5C5=C%11C%12=C(C6=C95)C7=C1C1=C%12C5=C%11C4=C3C3=C5C(=C81)C%10=C23'
mol = read_smiles(smiles)

# atom vector (C only)
print(mol.nodes(data='element'))
# adjacency matrix; to_numpy_matrix was removed in NetworkX 3.0, and
# to_numpy_array returns the same adjacency data as a plain ndarray.
print(nx.to_numpy_array(mol))

In [None]:
class Net(torch.nn.Module):
    """Two-layer GCN that outputs one sigmoid-activated value per graph.

    Args:
        num_node_features: Number of input features per node. Defaults to 1,
            matching the single atomic-number feature produced by the graph
            converters in this notebook. It was previously read from an
            undefined global, so ``Net()`` raised ``NameError``.
    """

    def __init__(self, num_node_features=1):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 32)
        self.fc = torch.nn.Linear(32, 1)

    def forward(self, data):
        """Return a (num_graphs, 1) tensor of values in [0, 1]."""
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # Average node embeddings per graph
        x = self.fc(x)
        return torch.sigmoid(x)

In [None]:
# Reuse the graphs built in the preprocessing cells (the literal `Data(...)`
# placeholder that stood here could not be trained on) and drop entries that
# are None because their SMILES failed to parse.
data_list = [graph for graph in data_list if graph is not None]
loader = DataLoader(data_list, batch_size=32, shuffle=True)

# The graph converters emit a single node feature (the atomic number).
num_node_features = 1

# Model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Training loop
model.train()
for epoch in range(20):
    running_loss = 0.0
    seen_graphs = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        # Net returns (num_graphs, 1) while data.y is (num_graphs,); BCELoss
        # requires matching shapes, so flatten both.
        loss = criterion(output.view(-1), data.y.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * data.num_graphs
        seen_graphs += data.num_graphs
    # One line per epoch (mean loss) instead of one line per batch.
    print(f'Epoch {epoch}, Loss: {running_loss / max(seen_graphs, 1)}')