In [43]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdmolops
import torch
from torch_geometric.data import Data, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [44]:
# Read the source table from disk into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer='allmurge.csv')

In [45]:
def normalize_features(features):
    """Standardize each feature column to zero mean and unit standard deviation.

    Args:
        features: Array-like exposing ``mean(axis=0)``, ``std(axis=0)`` and
            ``shape`` (called in this notebook with a float ``torch.Tensor``
            holding one row per node).

    Returns:
        An object of the same shape as ``features``. Columns whose standard
        deviation is zero, and inputs with fewer than two rows, are only
        mean-centered, so the result never contains NaN/inf produced by the
        normalization itself.
    """
    centered = features - features.mean(axis=0)

    # With fewer than two rows the sample standard deviation is undefined
    # (torch returns NaN for it), so return the centered values as-is.
    if features.shape[0] < 2:
        return centered

    std = features.std(axis=0)
    # Constant columns have std == 0. Adding the boolean mask turns exactly
    # those zeros into ones, so 0 / 0 = NaN becomes 0 / 1 = 0 while every
    # other column is divided by its real standard deviation.
    return centered / (std + (std == 0))

In [46]:
def smiles_to_graph(smiles):
    """Convert a SMILES string into an adjacency matrix and node features.

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        A tuple ``(adjacency_matrix, atom_features)`` where
        ``adjacency_matrix`` is the result of ``rdmolops.GetAdjacencyMatrix``
        and ``atom_features`` is a float tensor with one row per atom holding
        the atomic number after ``normalize_features`` (the normalization is
        applied per molecule).

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending input instead of an opaque error further down.
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    adjacency_matrix = rdmolops.GetAdjacencyMatrix(mol)

    atomic_numbers = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    # reshape(-1, 1) keeps the (num_atoms, 1) layout even when there are no atoms.
    atom_features = torch.tensor(atomic_numbers, dtype=torch.float).reshape(-1, 1)
    atom_features = normalize_features(atom_features)

    return adjacency_matrix, atom_features


In [47]:
# Convert every SMILES string once, then split the (adjacency, features)
# pairs into two parallel lists stored as new DataFrame columns.
graph_pairs = [
    smiles_to_graph(smiles_string)
    for smiles_string in data['PUBCHEM_EXT_DATASOURCE_SMILES']
]
adjacency_matrices = [pair[0] for pair in graph_pairs]
node_features = [pair[1] for pair in graph_pairs]

data['ADJACENCY_MATRIX'] = adjacency_matrices
data['NODE_FEATURES'] = node_features



In [48]:
# Replace the outcome labels with the integer codes assigned by LabelEncoder.
label_encoder = LabelEncoder()
outcome_codes = label_encoder.fit_transform(data['PUBCHEM_ACTIVITY_OUTCOME'])
data['PUBCHEM_ACTIVITY_OUTCOME'] = outcome_codes

# Rescale the activity score with MinMaxScaler (default feature range).
# NOTE(review): the scaler is fit on the full table, i.e. before the
# train/validation/test split performed in a later cell — confirm this is intended.
scaler = MinMaxScaler()
score_column = data[['PUBCHEM_ACTIVITY_SCORE']]
data['PUBCHEM_ACTIVITY_SCORE'] = scaler.fit(score_column).transform(score_column)


In [49]:
# Optional: persist the preprocessed table (index included) as CSV
data.to_csv(path_or_buf='./mnf.csv')

In [50]:
# Hold out 30% of the rows, then divide that hold-out 2:1 into validation
# and test, which yields a 7:2:1 train/validation/test ratio overall.
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=1/3,
    random_state=42,
)

In [51]:
# NOTE(review): an identical definition of this function appears again in a
# later cell and shadows this one; consider keeping a single definition.
def create_graph_data_list(df):
    """Build one ``Data`` graph object per row of ``df``.

    Args:
        df: DataFrame with the columns 'ADJACENCY_MATRIX', 'NODE_FEATURES',
            'PUBCHEM_ACTIVITY_OUTCOME' and 'PUBCHEM_ACTIVITY_SCORE'.

    Returns:
        A list of ``Data`` objects in row order. Each has ``x`` set to the
        row's node features, ``edge_index`` as a long tensor of shape
        ``(2, num_edges)`` and ``y`` as a float tensor ``[outcome, score]``.
    """
    columns = (
        'ADJACENCY_MATRIX',
        'NODE_FEATURES',
        'PUBCHEM_ACTIVITY_OUTCOME',
        'PUBCHEM_ACTIVITY_SCORE',
    )
    graph_data_list = []
    # Iterate the needed columns in lockstep rather than with iterrows(),
    # which builds a full Series for every row.
    for adj_matrix, features, outcome, score in zip(*(df[name] for name in columns)):
        # Tensor.nonzero() yields (num_edges, 2) index pairs in row-major
        # order; transposing gives the (2, num_edges) layout. This also
        # produces a valid (2, 0) tensor for a graph without edges, where
        # view(2, -1) on an empty tensor would raise.
        edge_index = (
            torch.as_tensor(adj_matrix, dtype=torch.long).nonzero().t().contiguous()
        )

        y = torch.tensor([outcome, score], dtype=torch.float)

        graph_data_list.append(Data(x=features, edge_index=edge_index, y=y))

    return graph_data_list

In [52]:
# Show the preprocessed table, including the ADJACENCY_MATRIX and NODE_FEATURES columns added above
data

Unnamed: 0,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,MESH,ADJACENCY_MATRIX,NODE_FEATURES
0,2962067.0,C1=CC=C(C=C1)CCC(=O)NC(=S)NC2=CC=CC=C2[N+](=O)...,1,0.600746,D016773,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.3888)], [tensor(-0.3888)], [tensor..."
1,4547230.0,CC1=C(C=C(C=C1)NS(=O)(=O)C2=CC3=C(C=C2)NC(=O)C...,1,0.600746,D016773,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.3672)], [tensor(-0.3672)], [tensor..."
2,22517460.0,CC1=C(C=CC(=C1)F)S(=O)(=O)NCC(C2=CN=CC=C2)N3CC...,1,0.600746,D016773,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.3541)], [tensor(-0.3541)], [tensor..."
3,3964482.0,C1CCC(=NNC(=O)CN2C3C(NC(=O)N3)NC2=O)C1,1,0.600746,D016773,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.7958)], [tensor(-0.7958)], [tensor..."
4,3825783.0,C1=CC(=CC(=C1)C(F)(F)F)CNC(=O)CSCC(=O)O,1,0.600746,D016773,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.5499)], [tensor(-0.5499)], [tensor..."
...,...,...,...,...,...,...,...
927585,658781.0,CC(=O)N=C(N)SC1=NC2=CC=CC=C2N1,1,0.600746,D014353,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.4038)], [tensor(-0.4038)], [tensor..."
927586,1952761.0,COC1=C(C2=C(C=C1)C=NN(C2=O)C3=CC=C(C=C3)OC(F)(...,1,0.600746,D014353,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.6550)], [tensor(1.1377)], [tensor(..."
927587,5737602.0,CC1=CC=C(C=C1)C(=O)N/C(=C\C2=CC(=CC=C2)[N+](=O...,1,0.600746,D014353,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.5724)], [tensor(-0.5724)], [tensor..."
927588,5736918.0,COC1=CC=C(C=C1)/C=C/C(=O)NCCCN2C=CN=C2,1,0.600746,D014353,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[tensor(-0.5064)], [tensor(2.5318)], [tensor(..."


In [53]:
# NOTE(review): this re-defines (and shadows) the function of the same name
# from an earlier cell; consider keeping a single definition.
def create_graph_data_list(df):
    """Build one ``Data`` graph object per row of ``df``.

    Args:
        df: DataFrame with the columns 'ADJACENCY_MATRIX', 'NODE_FEATURES',
            'PUBCHEM_ACTIVITY_OUTCOME' and 'PUBCHEM_ACTIVITY_SCORE'.

    Returns:
        A list of ``Data`` objects in row order. Each has ``x`` set to the
        row's node features, ``edge_index`` as a long tensor of shape
        ``(2, num_edges)`` and ``y`` as a float tensor ``[outcome, score]``.
    """
    columns = (
        'ADJACENCY_MATRIX',
        'NODE_FEATURES',
        'PUBCHEM_ACTIVITY_OUTCOME',
        'PUBCHEM_ACTIVITY_SCORE',
    )
    graph_data_list = []
    # Iterate the needed columns in lockstep rather than with iterrows(),
    # which builds a full Series for every row.
    for adj_matrix, features, outcome, score in zip(*(df[name] for name in columns)):
        # Tensor.nonzero() yields (num_edges, 2) index pairs in row-major
        # order; transposing gives the (2, num_edges) layout. This also
        # produces a valid (2, 0) tensor for a graph without edges, where
        # view(2, -1) on an empty tensor would raise.
        edge_index = (
            torch.as_tensor(adj_matrix, dtype=torch.long).nonzero().t().contiguous()
        )

        y = torch.tensor([outcome, score], dtype=torch.float)

        graph_data_list.append(Data(x=features, edge_index=edge_index, y=y))

    return graph_data_list

In [54]:
# Convert each split (train, validation, test — in that order) into a list of graphs
train_graph_data, val_graph_data, test_graph_data = (
    create_graph_data_list(split_frame)
    for split_frame in (train_data, val_data, test_data)
)


In [55]:
# Wrap each graph list in a mini-batch loader; only the training loader
# reshuffles its graphs, validation and test keep their order.
train_loader = DataLoader(
    dataset=train_graph_data,
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_graph_data,
    batch_size=32,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_graph_data,
    batch_size=32,
    shuffle=False,
)




In [56]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F

In [57]:

class GCN(torch.nn.Module):
    """Two GCNConv layers followed by mean pooling and a linear output layer.

    Args:
        input_dim: Number of features per node.
        hidden_dim: Width of both graph-convolution layers.
        output_dim: Number of values predicted per graph.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = Linear(hidden_dim, output_dim)

    def forward(self, data):
        """Return one ``output_dim``-sized prediction row per graph in ``data``.

        ``data`` must provide ``x``, ``edge_index`` and ``batch`` attributes.
        """
        node_embeddings = F.relu(self.conv1(data.x, data.edge_index))
        node_embeddings = self.conv2(node_embeddings, data.edge_index)

        # Average the node embeddings of each graph into a single vector
        graph_embeddings = global_mean_pool(node_embeddings, data.batch)

        return self.fc(graph_embeddings)

In [58]:
# Model dimensions: the input width is read from the first training graph,
# the hidden width is fixed, and the two outputs correspond to
# PUBCHEM_ACTIVITY_OUTCOME and PUBCHEM_ACTIVITY_SCORE.
input_dim = train_graph_data[0].x.size(1)
hidden_dim = 16
output_dim = 2

model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [62]:
# Training loop: 200 epochs of Adam on the MSE between the two model outputs
# and the [outcome, score] targets, with a validation pass after every epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(200):
    # Switch back to training mode at the start of every epoch; the
    # validation pass below leaves the model in eval mode.
    model.train()
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)

        # batch.y holds the per-graph targets concatenated into one flat
        # vector; reshape to one row per graph so it matches the output.
        target = batch.y.view(-1, output_dim)

        loss = F.mse_loss(out, target)  # Predict both activation outcome and score
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the mean over all training batches (not just the last batch),
    # matching how the validation loss is aggregated.
    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            out = model(batch)
            target = batch.y.view(-1, output_dim)
            val_loss += F.mse_loss(out, target).item()
    val_loss /= len(val_loader)

    print(f'Epoch {epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}')


Epoch 0, Train Loss: nan, Validation Loss: nan
Epoch 1, Train Loss: nan, Validation Loss: nan
Epoch 2, Train Loss: nan, Validation Loss: nan
Epoch 3, Train Loss: nan, Validation Loss: nan
Epoch 4, Train Loss: nan, Validation Loss: nan
Epoch 5, Train Loss: nan, Validation Loss: nan
Epoch 6, Train Loss: nan, Validation Loss: nan
Epoch 7, Train Loss: nan, Validation Loss: nan
Epoch 8, Train Loss: nan, Validation Loss: nan
Epoch 9, Train Loss: nan, Validation Loss: nan
Epoch 10, Train Loss: nan, Validation Loss: nan
Epoch 11, Train Loss: nan, Validation Loss: nan
Epoch 12, Train Loss: nan, Validation Loss: nan
Epoch 13, Train Loss: nan, Validation Loss: nan
Epoch 14, Train Loss: nan, Validation Loss: nan
Epoch 15, Train Loss: nan, Validation Loss: nan
Epoch 16, Train Loss: nan, Validation Loss: nan
Epoch 17, Train Loss: nan, Validation Loss: nan
Epoch 18, Train Loss: nan, Validation Loss: nan
Epoch 19, Train Loss: nan, Validation Loss: nan
Epoch 20, Train Loss: nan, Validation Loss: nan
Ep

In [65]:
# Evaluate on the held-out test set: mean of the per-batch MSE values
model.eval()
batch_losses = []
with torch.no_grad():
    for test_batch in test_loader:
        predictions = model(test_batch)
        # One target row per predicted graph
        expected = test_batch.y.view(predictions.size(0), -1)
        batch_losses.append(F.mse_loss(predictions, expected).item())
test_loss = sum(batch_losses) / len(test_loader)
print(f'Test Loss: {test_loss}')

Test Loss: nan
