In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from pubchempy import get_compounds
from rdkit import Chem
from rdkit.Chem import rdmolops
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader
import pandas as pd

# Load SIDER side effects data from meddra_all_se.tsv
def load_sider_data(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['stitch_id', 'umls_id', 'side_effect'])
    return df

# Fetch PubChem data
def fetch_pubchem_data(drug_name):
    compounds = get_compounds(drug_name, 'name')
    if not compounds:  # Check if the list is empty
        print(f"Warning: No PubChem data found for {drug_name}")
        return None
    smiles = compounds[0].isomeric_smiles
    return smiles

# Generate molecular graphs
def generate_molecular_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string")
    
    adj = rdmolops.GetAdjacencyMatrix(mol)
    edge_index = torch.from_numpy(np.array(adj.nonzero(), dtype=np.int64))
    x = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.float).view(-1, 1)
    return Data(x=x, edge_index=edge_index)

# Preprocess data
def preprocess_data(sider_path, drug_names_path):
    sider_df = load_sider_data(sider_path)
    drug_names = pd.read_csv(drug_names_path, sep='\t', header=None, names=['stitch_id', 'drug_name'])
    data_list = []

    for _, row in drug_names.iterrows():
        try:
            smiles = fetch_pubchem_data(row['drug_name'])
            if smiles is None:  # Skip if no valid SMILES found
                continue
            side_effects = sider_df[sider_df['stitch_id'] == row['stitch_id']]['side_effect'].values
            graph = generate_molecular_graph(smiles)
            graph.y = torch.tensor([len(side_effects)], dtype=torch.float)  # Using count of side effects
            data_list.append(graph)
        except Exception as e:
            print(f"Error processing {row['drug_name']}: {e}")
    return data_list

# Define GNN model
class DrugGNN(torch.nn.Module):
    def __init__(self):
        super(DrugGNN, self).__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 32)
        self.fc = torch.nn.Linear(32, 1)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        return torch.sigmoid(self.fc(x))

# Training loop
def train_model(data_list):
    model = DrugGNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loader = DataLoader(data_list, batch_size=32, shuffle=True)

    for epoch in range(100):
        total_loss = 0
        for data in loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = F.mse_loss(out, data.y.view(-1, 1))  # Ensure correct tensor shape
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(loader)}')

if __name__ == "__main__":
    data_list = preprocess_data('meddra_all_se.tsv', 'drug_names.tsv')
    if data_list:
        train_model(data_list)
    else:
        print("No valid data to train on.")


Error processing amoxapine: 'PUGREST.ServerBusy'


In [7]:
import pandas as pd

drug_names = pd.read_csv('drug_names.tsv', sep='\\t', header=None, names=['stitch_id', 'drug_name'])
# print(drug_names.head())  # Display the first few rows
print(drug_names.columns)
drug_names.columns = drug_names.columns.str.strip()
print(drug_names.columns)
print(drug_names.head())

Index(['stitch_id', 'drug_name'], dtype='object')
Index(['stitch_id', 'drug_name'], dtype='object')
      stitch_id                 drug_name
0  CID100000085                 carnitine
1  CID100000119        gamma-aminobutyric
2  CID100000137          5-aminolevulinic
3  CID100000143                leucovorin
4  CID100000146  5-methyltetrahydrofolate


  drug_names = pd.read_csv('drug_names.tsv', sep='\\t', header=None, names=['stitch_id', 'drug_name'])
