<a href="https://colab.research.google.com/github/sam02111972/deepchem_sam/blob/main/tox_fin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Toxicity_Final
# Install dependencies
!pip install torch torch-geometric deepchem rdkit-pypi numpy pandas

# Import libraries
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import pandas as pd
from rdkit import Chem
from deepchem.feat import MolGraphConvFeaturizer
from sklearn.model_selection import train_test_split

# Load custom dataset
# The molecule strings are read from the 'string' column and the targets
# from the 'label' column of the CSV.
df = pd.read_csv('/content/combined_toxicity_data (3).csv')
labels = df['label'].values
smiles_list = df['string'].tolist()

# Convert SMILES to graphs
# Every usable (SMILES, label) row becomes one torch_geometric `Data` object.
# Rows that cannot be used are skipped and counted so the amount of dropped
# data is visible instead of disappearing silently.
featurizer = MolGraphConvFeaturizer()
geometric_data = []
skipped_rows = 0
for smile, label in zip(smiles_list, labels):
    # Missing CSV cells arrive as NaN: a non-string SMILES cannot be parsed,
    # and a NaN label would make the BCE loss (and then the weights) NaN.
    if not isinstance(smile, str) or pd.isna(label):
        skipped_rows += 1
        continue

    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # RDKit could not parse this string.
        skipped_rows += 1
        continue

    try:
        graph = featurizer.featurize(mol)[0]
        edge_index = torch.tensor(graph.edge_index, dtype=torch.long)
        x = torch.tensor(graph.node_features, dtype=torch.float)
        y = torch.tensor([[label]], dtype=torch.float)  # Match shape [batch=1, tasks=1]
        geometric_data.append(Data(x=x, edge_index=edge_index, y=y))
    except Exception:
        # Best-effort: drop molecules that fail featurization or tensor
        # conversion. `Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt/SystemExit working so the loop can be stopped.
        skipped_rows += 1
        continue

if skipped_rows:
    print(f'Skipped {skipped_rows} of {len(smiles_list)} rows while building graphs')

# Split data
# Hold out 20% of the graphs for testing; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data = train_test_split(
    geometric_data,
    test_size=0.2,
    random_state=42,
)

# Define GNN Model
class GNN(torch.nn.Module):
    """Two-layer graph convolutional network producing one probability per graph.

    Node features go through two `GCNConv` layers (ReLU and dropout after the
    first), are mean-pooled per graph, and are mapped by a linear layer to a
    single value that is squashed with a sigmoid.

    Args:
        hidden_channels: Width of both graph-convolution layers.
        in_channels: Number of input features per node. When ``None`` (the
            default) it is taken from the first graph of the module-level
            ``train_data``, which is what this class always did before the
            parameter existed.
    """

    def __init__(self, hidden_channels=64, in_channels=None):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: infer the width from the training set.
            in_channels = train_data[0].num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """Return sigmoid outputs of shape ``[num_graphs, 1]`` for ``data``.

        ``data`` must expose ``x`` and ``edge_index``; ``batch`` is optional.
        """
        x, edge_index = data.x, data.edge_index
        batch = getattr(data, 'batch', None)
        if batch is None:
            # A single graph that never went through a DataLoader has no batch
            # vector; assign every node to graph 0 so pooling does not depend
            # on how the pooling helper treats a missing batch.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)  # active only in train mode
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)  # one row per graph
        x = self.lin(x)
        return torch.sigmoid(x)

# Training setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = GNN()
model = model.to(device)
# The model already ends in a sigmoid, so plain BCE on probabilities is used.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Data loaders
# Only the training loader reshuffles between epochs.
train_loader = DataLoader(
    train_data,
    batch_size=2,
    shuffle=True,
)
test_loader = DataLoader(
    test_data,
    batch_size=2,
    shuffle=False,
)

# Training loop
def train():
    """Run one optimisation pass over ``train_loader``.

    Returns:
        The loss summed over batches (each batch loss weighted by its number
        of graphs) divided by the size of the training dataset.
    """
    model.train()
    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch), batch.y)
        batch_loss.backward()
        optimizer.step()
        # Weight by graph count so the last, possibly smaller, batch is not
        # over-represented in the epoch figure.
        running_loss += batch_loss.item() * batch.num_graphs
    return running_loss / len(train_loader.dataset)

# Testing loop
def test(loader):
    """Measure accuracy of the global ``model`` on ``loader``.

    Predictions above 0.5 count as class 1. Returns the number of predictions
    equal to their label divided by ``len(loader.dataset)``.
    """
    model.eval()
    hits = 0
    # Gradients are never needed for evaluation.
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            probabilities = model(batch)
            predicted = (probabilities > 0.5).float()
            hits += (predicted == batch.y).sum().item()
    return hits / len(loader.dataset)

# Train and test
# 100 epochs; after each one, report the mean training loss together with the
# accuracy on the training and test loaders.
for epoch in range(1, 101):
    loss = train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    print(
        f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}'
    )

# Prediction function
def predict_smiles(smiles):
    """Return the model's output probability for a single SMILES string.

    Args:
        smiles: SMILES representation of the molecule to score.

    Returns:
        The sigmoid output of the global ``model`` as a Python float.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` or the featurizer does
            not return a usable graph for it.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    graph = featurizer.featurize(mol)[0]
    # A failed featurization does not yield a graph object with node/edge data.
    if not hasattr(graph, 'node_features') or not hasattr(graph, 'edge_index'):
        raise ValueError(f"Could not featurize molecule: {smiles!r}")

    data = Data(
        x=torch.tensor(graph.node_features, dtype=torch.float),
        edge_index=torch.tensor(graph.edge_index, dtype=torch.long)
    ).to(device)

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        prediction = model(data)
    return prediction.item()

# Example prediction
# Score one molecule (the SMILES below is the one labelled aspirin) and
# print the resulting probability.
aspirin_toxicity = predict_smiles(
    'CC(=O)OC1=C(C=CC=C1)C(=O)O'
)
print(f"\nAspirin Toxicity Probability: {aspirin_toxicity:.4f}")