In [1]:
import duckdb
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import average_precision_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

In [2]:
train_path = './train.parquet'
test_path = './test.parquet'

# Rows drawn for each value of `binds`, so the sample holds as many binders as non-binders.
SAMPLES_PER_CLASS = 30000

# One randomly ordered, size-limited subquery per label (0 first, then 1);
# UNION ALL stacks the two results into a single frame.
balanced_sample_sql = " UNION ALL ".join(
    f"(SELECT * FROM parquet_scan('{train_path}') "
    f"WHERE binds = {label} ORDER BY random() LIMIT {SAMPLES_PER_CLASS})"
    for label in (0, 1)
)

con = duckdb.connect()
try:
    df = con.query(balanced_sample_sql).df()
finally:
    # Release the DuckDB connection even when the query fails.
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
# Peek at the first rows of the class-balanced training sample loaded above.
df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,167079197,O=C(Nc1cc(C(=O)O)ccc1Cl)OCC1c2ccccc2-c2ccccc21,Cc1ccc(N)c(CO)c1,Cn1ccc(N)n1,Cc1ccc(Nc2nc(Nc3ccn(C)n3)nc(Nc3cc(C(=O)N[Dy])c...,sEH,0
1,172823830,O=C(Nc1cc(F)c(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,Cn1ccc2cc(N)ccc21,CO[C@@H]1COC[C@H]1n1cc(CN)nn1.Cl,CO[C@@H]1COC[C@H]1n1cc(CNc2nc(Nc3ccc4c(ccn4C)c...,HSA,0
2,125278635,O=C(N[C@H](Cc1ccc(I)cc1)C(=O)O)OCC1c2ccccc2-c2...,Nc1nncs1,CC(O)CCN,CC(O)CCNc1nc(Nc2nncs2)nc(N[C@H](Cc2ccc(I)cc2)C...,BRD4,0
3,105806648,O=C(N[C@@H](Cc1cccs1)C(=O)O)OCC1c2ccccc2-c2ccc...,CCSCCN.Cl,Nc1cc(Br)cn2ccnc12,CCSCCNc1nc(Nc2cc(Br)cn3ccnc23)nc(N[C@@H](Cc2cc...,sEH,0
4,261419929,O=C(O)C[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)c1c...,Nc1nc(Cl)cc(Cl)n1,Cl.NCC(=O)NC1CCC1,O=C(C[C@@H](Nc1nc(NCC(=O)NC2CCC2)nc(Nc2nc(Cl)c...,HSA,0


preprocess

In [4]:
# Originally generated with GPT; generalized and hardened during review.
def compute_ecfp(smiles_string, radius=2, n_bits=2048):
    """Return the Morgan (ECFP-style) bit fingerprint of a SMILES string as a list.

    Parameters
    ----------
    smiles_string : str
        SMILES to fingerprint. Anything that is not a ``str`` (e.g. ``None`` or
        a NaN from a missing cell) is treated like an unparseable molecule.
    radius : int, default 2
        Morgan radius forwarded to RDKit.
    n_bits : int, default 2048
        Length of the bit vector, and of the returned list.

    Returns
    -------
    list
        ``list(fp)`` of the RDKit bit vector, or ``n_bits`` zeros when the
        input is not a string or RDKit cannot parse it.
    """
    # Only hand real strings to the RDKit parser; missing values take the same
    # zero-vector fallback as SMILES that fail to parse.
    mol = Chem.MolFromSmiles(smiles_string) if isinstance(smiles_string, str) else None
    if mol is None:
        return [0] * n_bits

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    return list(fp)

# Fingerprint every full-molecule SMILES and store the result in a new 'ecfp' column.
df['ecfp'] = df['molecule_smiles'].map(compute_ecfp)

KeyboardInterrupt: 

In [None]:
# Preview again: once the fingerprint cell above has finished, each row also carries an 'ecfp' list.
df.head()

In [15]:
# Encode each row's protein target as an indicator vector (one column per distinct protein).
# The encoder is kept in `onehot_encoder` so the same fitted mapping can be reused later.
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(
    df[['protein_name']].to_numpy()  # two-dimensional (n_rows, 1) input for the encoder
)

In [19]:
from rdkit import Chem
import torch
from torch_geometric.data import Data

def molecule_to_graph(smiles, protein_features=None, label=None):
    """Convert a SMILES string into a torch_geometric ``Data`` graph.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule. Non-string input is treated as unparseable.
    protein_features : sequence of numbers, optional
        Feature vector of the protein paired with this molecule (for example
        one row of the one-hot matrix). Stored as ``u`` with shape
        ``(1, n_features)``. When omitted, no ``u`` attribute is set; the
        earlier behaviour of attaching the entire global one-hot matrix to
        every graph was a bug and has been removed.
    label : number, optional
        Target value for the graph. Stored as ``y`` with shape ``(1, 1)``.

    Returns
    -------
    Data or None
        Graph with ``x`` (atomic number per atom, shape ``(n_atoms, 1)``),
        ``edge_index`` (shape ``(2, 2 * n_bonds)``) and ``edge_attr`` (bond
        order, one entry per directed edge), or ``None`` if RDKit cannot
        parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if molecule is None:
        return None

    # Node features: a single column holding each atom's atomic number.
    atomic_numbers = [atom.GetAtomicNum() for atom in molecule.GetAtoms()]
    node_features = torch.tensor(atomic_numbers, dtype=torch.float).unsqueeze(1)

    # Every bond contributes two directed edges, and each direction gets its own
    # bond-order entry so edge_attr stays aligned with the columns of edge_index.
    sources, targets, bond_orders = [], [], []
    for bond in molecule.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        order = bond.GetBondTypeAsDouble()
        sources += [start, end]
        targets += [end, start]
        bond_orders += [order, order]

    # Building from two row lists keeps the shape (2, n_edges) even with zero bonds.
    edge_index = torch.tensor([sources, targets], dtype=torch.long)
    edge_features = torch.tensor(bond_orders, dtype=torch.float)

    graph = Data(x=node_features, edge_index=edge_index, edge_attr=edge_features)
    if protein_features is not None:
        graph.u = torch.as_tensor(protein_features, dtype=torch.float).reshape(1, -1)
    if label is not None:
        graph.y = torch.tensor([[float(label)]])
    return graph


Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,ecfp
0,93094066,O=C(N[C@@H](Cc1ccc(Br)cc1)C(=O)O)OCC1c2ccccc2-...,Cl.Cl.NCc1ccc(-n2cncn2)cc1,Cl.NCC1CC2(C1)CC2(F)F,O=C(N[Dy])[C@H](Cc1ccc(Br)cc1)Nc1nc(NCc2ccc(-n...,HSA,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,6164479,C=CCC(CC=C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1ccnc(N)c1,Cl.NCC1(O)C2C3CC4C5C3CC2C5C41,C=CCC(CC=C)(Nc1nc(NCC2(O)C3C4CC5C6C4CC3C6C52)n...,HSA,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,24243502,CCc1cccc(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C(=O)O,COc1ccc([N+](=O)[O-])c(N)n1,Cl.NC1CCC(=O)CC1,CCc1cccc(Nc2nc(Nc3nc(OC)ccc3[N+](=O)[O-])nc(NC...,HSA,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,255524911,O=C(O)C[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)c1c...,Cl.NCCC1CCCC1(F)F,NCc1ccsc1,O=C(C[C@@H](Nc1nc(NCCC2CCCC2(F)F)nc(NCc2ccsc2)...,HSA,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,95205929,O=C(N[C@@H](Cc1ccc(Cl)c(Cl)c1)C(=O)O)OCC1c2ccc...,Cc1cnc(N)cn1,Nc1nc(-c2ccc(Cl)c(Cl)c2)cs1,Cc1cnc(Nc2nc(Nc3nc(-c4ccc(Cl)c(Cl)c4)cs3)nc(N[...,sEH,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [20]:
# Build one graph per row of `df`, attaching that row's binding label and protein
# one-hot vector. Each SMILES is converted exactly once (the earlier comprehension
# converted it twice), and rows whose SMILES cannot be parsed are skipped.
data_list = []
for smiles, binds, protein_vec in zip(df['molecule_smiles'], df['binds'], protein_onehot):
    graph = molecule_to_graph(smiles)
    if graph is None:
        continue
    graph.y = torch.tensor([[float(binds)]])  # (1, 1) so batches stack to (batch, 1)
    graph.u = torch.tensor(protein_vec, dtype=torch.float).unsqueeze(0)  # (1, n_proteins)
    data_list.append(graph)


KeyboardInterrupt: 

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,ecfp
0,93094066,O=C(N[C@@H](Cc1ccc(Br)cc1)C(=O)O)OCC1c2ccccc2-...,Cl.Cl.NCc1ccc(-n2cncn2)cc1,Cl.NCC1CC2(C1)CC2(F)F,O=C(N[Dy])[C@H](Cc1ccc(Br)cc1)Nc1nc(NCc2ccc(-n...,HSA,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,6164479,C=CCC(CC=C)(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1ccnc(N)c1,Cl.NCC1(O)C2C3CC4C5C3CC2C5C41,C=CCC(CC=C)(Nc1nc(NCC2(O)C3C4CC5C6C4CC3C6C52)n...,HSA,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,24243502,CCc1cccc(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C(=O)O,COc1ccc([N+](=O)[O-])c(N)n1,Cl.NC1CCC(=O)CC1,CCc1cccc(Nc2nc(Nc3nc(OC)ccc3[N+](=O)[O-])nc(NC...,HSA,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,255524911,O=C(O)C[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)c1c...,Cl.NCCC1CCCC1(F)F,NCc1ccsc1,O=C(C[C@@H](Nc1nc(NCCC2CCCC2(F)F)nc(NCc2ccsc2)...,HSA,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,95205929,O=C(N[C@@H](Cc1ccc(Cl)c(Cl)c1)C(=O)O)OCC1c2ccc...,Cc1cnc(N)cn1,Nc1nc(-c2ccc(Cl)c(Cl)c2)cs1,Cc1cnc(Nc2nc(Nc3nc(-c4ccc(Cl)c(Cl)c4)cs3)nc(N[...,sEH,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [7]:

# Example GNN model
class GNNModel(torch.nn.Module):
    """Two-layer GCN that produces one sigmoid output row per graph.

    Parameters
    ----------
    num_node_features : int
        Width of ``data.x`` (number of features per node).
    num_classes : int
        Number of output channels per graph.
    """

    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        """Return probabilities of shape ``(num_graphs, num_classes)``."""
        x, edge_index = data.x, data.edge_index
        batch = getattr(data, 'batch', None)
        if batch is None:
            # A single un-batched graph: every node belongs to graph 0.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        # Average node outputs per graph so predictions line up with graph-level labels.
        x = global_mean_pool(x, batch)
        return torch.sigmoid(x)


# `data_list` is the list of Data objects built from the sampled training frame.
if any(getattr(graph, 'y', None) is None for graph in data_list):
    raise ValueError("every graph in data_list needs a binding label in `.y` before training")

# The sampled frame is ordered by class, so a plain 80/20 slice would leave a
# single class in the held-out part. Shuffle (seeded) before splitting.
train_graphs, test_graphs = train_test_split(data_list, test_size=0.2, random_state=42, shuffle=True)
train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
test_loader = DataLoader(test_graphs, batch_size=32, shuffle=False)

# Take the input width from the data instead of hard-coding it.
model = GNNModel(num_node_features=data_list[0].x.size(1), num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Training loop (a single pass over the training graphs)
model.train()
for data in train_loader:
    optimizer.zero_grad()
    output = model(data)
    # BCELoss needs float targets with exactly the same shape as the predictions.
    loss = criterion(output, data.y.view_as(output).float())
    loss.backward()
    optimizer.step()

# Evaluation loop: collect scores and labels, then compute average precision
model.eval()
test_scores, test_targets = [], []
with torch.no_grad():
    for data in test_loader:
        output = model(data)
        test_scores.append(output.view(-1))
        test_targets.append(data.y.view(-1))

test_average_precision = average_precision_score(
    torch.cat(test_targets).numpy(), torch.cat(test_scores).numpy()
)
print(f'Test average precision: {test_average_precision:.4f}')


NameError: name 'graph_data_list' is not defined

In [None]:
class Net(torch.nn.Module):
    """Graph-level binary classifier: two GCN layers, mean pooling, linear head.

    Parameters
    ----------
    num_node_features : int, default 1
        Width of ``data.x``. The default of 1 matches graphs whose only node
        feature is the atomic number. Previously this value was read from an
        undefined global, so ``Net()`` could not be constructed.
    """

    def __init__(self, num_node_features=1):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 32)
        self.fc = torch.nn.Linear(32, 1)

    def forward(self, data):
        """Return one sigmoid probability per graph, shape ``(num_graphs, 1)``."""
        x, edge_index = data.x, data.edge_index
        batch = getattr(data, 'batch', None)
        if batch is None:
            # A single un-batched graph: every node belongs to graph 0.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)

        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # Average node embeddings per graph
        x = self.fc(x)
        return torch.sigmoid(x)

In [None]:
# Train `Net` on the graphs already held in `data_list`. The former placeholder
# `data_list = [Data(...)]` is gone: it replaced the real graphs with a single
# Data object wrapping an Ellipsis.
NUM_EPOCHS = 200

if any(getattr(graph, 'y', None) is None for graph in data_list):
    raise ValueError("every graph in data_list needs a binding label in `.y` before training")

loader = DataLoader(data_list, batch_size=32, shuffle=True)

# Input width of the first GCN layer, taken from the data. It is defined at
# module level because `Net()` is constructed without arguments.
num_node_features = data_list[0].x.size(1)

# Model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    loss_sum, graphs_seen = 0.0, 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        # BCELoss needs float targets with exactly the same shape as the predictions.
        loss = criterion(output, data.y.view_as(output).float())
        loss.backward()
        optimizer.step()
        loss_sum += loss.item() * output.size(0)
        graphs_seen += output.size(0)
    # One line per epoch (mean loss per graph) instead of one line per batch.
    print(f'Epoch {epoch}, Loss: {loss_sum / max(graphs_seen, 1)}')

In [None]:
import os

# Score the test CSV chunk by chunk so the whole file never has to sit in memory.
test_file = './test.csv'
output_file = 'submission.csv'  # Path and filename of the submission written below
CHUNK_SIZE = 100000

# NOTE(review): this cell calls `model.predict_proba`, i.e. it expects a fitted
# scikit-learn style classifier trained on [ECFP bits + protein one-hot] rows.
# Fail early with a clear message if `model` is something else - confirm which
# model is meant to produce the submission.
if not hasattr(model, 'predict_proba'):
    raise TypeError("`model` must provide predict_proba (a fitted scikit-learn style classifier)")

# Start from a clean file; otherwise a re-run would append to the previous submission.
if os.path.exists(output_file):
    os.remove(output_file)

for df_test in pd.read_csv(test_file, chunksize=CHUNK_SIZE):

    # Generate ECFPs straight from the SMILES strings with the notebook's own helper
    ecfp_bits = np.array([compute_ecfp(smiles) for smiles in df_test['molecule_smiles']])

    # One-hot encode the protein_name with the encoder fitted on the training data.
    # A local name is used so the training-time `protein_onehot` matrix is not overwritten.
    chunk_protein_onehot = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))

    # Combine ECFPs and one-hot encoded protein_name column-wise
    X_test = np.hstack([ecfp_bits, chunk_protein_onehot])

    # Predict the probability of the positive class
    probabilities = model.predict_proba(X_test)[:, 1]

    # Create a DataFrame with 'id' and 'binds' (predicted probability) columns
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})

    # Append this chunk; the header is written only while the file does not exist yet
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))