# In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import TransformerConv
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, average_precision_score, matthews_corrcoef, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split

# Read the raw interaction table into a DataFrame.
DATASET_CSV = '/content/df (2).csv'  # Point this at your own dataset file
df = pd.read_csv(DATASET_CSV)

def smiles_to_graph(smiles):
    """Convert a SMILES string into graph tensors using RDKit.

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        A tuple ``(x, edge_index, edge_attr)``:

        * ``x`` -- float tensor of shape ``(num_atoms, 1)`` holding the
          atomic number of each atom.
        * ``edge_index`` -- long tensor of shape ``(2, 2 * num_bonds)``;
          every bond is listed once in each direction.
        * ``edge_attr`` -- float tensor of shape ``(2 * num_bonds, 1)``
          holding the bond order (``GetBondTypeAsDouble``) of each
          directed edge.

        ``(None, None, None)`` is returned when ``smiles`` is not a string,
        cannot be parsed by RDKit, or describes a molecule without atoms.
    """
    # Missing values read from a CSV arrive as NaN floats, not strings.
    if not isinstance(smiles, str):
        return None, None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        return None, None, None

    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    x = torch.tensor(atomic_numbers, dtype=torch.float).view(-1, 1)  # Node features

    # Message passing follows edge direction, so each undirected bond is
    # emitted as two directed edges that share the same bond order.
    sources, targets, bond_orders = [], [], []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        order = bond.GetBondTypeAsDouble()
        sources.extend((begin, end))
        targets.extend((end, begin))
        bond_orders.extend((order, order))

    # Building the tensor from two rows keeps the (2, E) shape even when the
    # molecule has no bonds (e.g. a single ion); transposing an empty list of
    # pairs would yield a 1-D tensor instead.
    edge_index = torch.tensor([sources, targets], dtype=torch.long)
    edge_attr = torch.tensor(bond_orders, dtype=torch.float).view(-1, 1)
    return x, edge_index, edge_attr

def sequence_to_tensor(sequence):
    """Encode a protein sequence as a 1-D long tensor of residue indices.

    Each of the 20 one-letter codes in 'ACDEFGHIKLMNPQRSTVWY' maps to its
    position in that string. Characters outside this alphabet (including
    lowercase letters) are dropped, so the result may be shorter than the
    input.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    lookup = dict(zip(alphabet, range(len(alphabet))))

    encoded = []
    for residue in sequence:
        position = lookup.get(residue)
        if position is not None:
            encoded.append(position)
    return torch.tensor(encoded, dtype=torch.long)

def df_to_data_list(df):
    """Build one PyTorch Geometric ``Data`` object per usable DataFrame row.

    The 'smiles', 'protein_sequence' and 'interaction_label' columns of each
    row are read. Rows for which ``smiles_to_graph`` reports no node features
    are left out, so the result can be shorter than ``df``.

    Returns:
        A list of ``Data`` objects with fields ``x``, ``edge_index``,
        ``edge_attr``, ``y`` (one-element float tensor) and ``protein_seq``.
    """
    samples = []
    for _, record in df.iterrows():
        node_feats, bond_index, bond_feats = smiles_to_graph(record['smiles'])
        if node_feats is None:
            # smiles_to_graph signals an unusable SMILES string with None.
            continue

        encoded_protein = sequence_to_tensor(record['protein_sequence'])
        target = torch.tensor([record['interaction_label']], dtype=torch.float)
        samples.append(
            Data(
                x=node_feats,
                edge_index=bond_index,
                edge_attr=bond_feats,
                y=target,
                protein_seq=encoded_protein,
            )
        )
    return samples

# Hold out 20% of the rows for testing (fixed seed so the split is
# repeatable), then turn each partition into a list of graph samples.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_data_list, test_data_list = (
    df_to_data_list(partition) for partition in (train_df, test_df)
)

# Define the GraphormerDTI model components
class DrugRepresentation(nn.Module):
    """Encode molecular graphs with a stack of ``TransformerConv`` layers.

    Args:
        atom_dim: Size of each input node feature vector.
        bond_dim: Size of each edge feature vector.
        hidden_dim: Width of the node embeddings and of the pooled output.
        num_heads: Attention heads per layer; each head gets
            ``hidden_dim // num_heads`` channels.
        num_layers: Number of stacked ``TransformerConv`` layers.

    Raises:
        ValueError: If ``hidden_dim`` is not divisible by ``num_heads``; the
            layer output width would then differ from ``hidden_dim`` and the
            layers could not be chained.
    """

    def __init__(self, atom_dim, bond_dim, hidden_dim, num_heads, num_layers):
        super().__init__()
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.input_proj = nn.Linear(atom_dim, hidden_dim)
        self.graph_transformer_layers = nn.ModuleList([
            TransformerConv(in_channels=hidden_dim, out_channels=hidden_dim // num_heads, heads=num_heads, edge_dim=bond_dim)
            for _ in range(num_layers)
        ])

    @staticmethod
    def _mean_pool(x, batch=None):
        """Average node embeddings into one row per graph.

        Args:
            x: Node embeddings of shape ``(num_nodes, channels)``.
            batch: Optional long tensor of shape ``(num_nodes,)`` assigning
                each node to a graph index. When omitted, all nodes are
                treated as one graph.

        Returns:
            Tensor of shape ``(num_graphs, channels)``; ``(1, channels)`` when
            ``batch`` is ``None``.
        """
        if batch is None:
            return x.mean(dim=0, keepdim=True)

        num_graphs = int(batch.max()) + 1 if batch.numel() > 0 else 0
        sums = x.new_zeros((num_graphs, x.size(1))).index_add(0, batch, x)
        # clamp guards against division by zero for a graph index that owns
        # no nodes.
        counts = torch.bincount(batch, minlength=num_graphs).clamp(min=1)
        return sums / counts.to(sums.dtype).unsqueeze(1)

    def forward(self, x, edge_index, edge_attr, batch=None):
        """Return pooled graph embeddings of shape ``(num_graphs, hidden_dim)``.

        ``batch`` is optional; without it every node is pooled into a single
        row, which is only meaningful when the input holds one molecule.
        """
        x = self.input_proj(x)
        for layer in self.graph_transformer_layers:
            x = layer(x, edge_index, edge_attr)
        # Mean pooling across the nodes of each molecule gives a fixed-size
        # vector per molecule.
        return self._mean_pool(x, batch)


class ProteinRepresentation(nn.Module):
    """Encode residue-index sequences with an embedding and 1-D convolutions.

    Args:
        embed_dim: Size of each residue embedding.
        hidden_dim: Number of channels produced by every convolution.
        num_layers: Number of stacked ``Conv1d`` layers (kernel 3, padding 1).
        vocab_size: Number of distinct residue indices accepted.
    """

    def __init__(self, embed_dim, hidden_dim, num_layers, vocab_size=20):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, kernel_size=3, padding=1)
            for i in range(num_layers)
        ])

    def forward(self, protein_seq):
        """Return one ``hidden_dim`` vector per sequence.

        Args:
            protein_seq: Long tensor of residue indices, either ``(length,)``
                for a single sequence or ``(batch, length)``.

        Returns:
            Tensor of shape ``(batch, hidden_dim)``; ``batch`` is 1 for a
            1-D input.

        Raises:
            ValueError: If the sequence axis is empty.
        """
        if protein_seq.dim() == 1:
            # A 1-D tensor is ONE sequence. The batch axis goes in front so
            # the convolutions slide along the residues; inserting it after
            # the length axis would turn every residue into its own
            # length-1 sequence.
            protein_seq = protein_seq.unsqueeze(0)
        if protein_seq.size(-1) == 0:
            raise ValueError("protein_seq must contain at least one residue")

        x = self.embedding(protein_seq)  # (batch, length, embed_dim)
        x = x.permute(0, 2, 1)  # Conv1d expects (batch, channels, length)

        for conv in self.conv_layers:
            x = F.relu(conv(x))

        # Mean pooling across sequence length to get fixed-size representation
        return x.mean(dim=2)


class InteractionPrediction(nn.Module):
    """Three-layer MLP mapping concatenated pair features to a probability.

    Args:
        input_dim: Width of the concatenated drug and protein features.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of outputs per pair.
    """

    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super().__init__()
        self.fc_layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, drug_feat, protein_feat):
        """Return sigmoid outputs of shape ``(batch, output_dim)``."""
        x = torch.cat((drug_feat, protein_feat), dim=1)
        return torch.sigmoid(self.fc_layers(x))  # Apply sigmoid for binary classification


class GraphormerDTI(nn.Module):
    """Drug-target interaction model: graph encoder + sequence CNN + MLP head."""

    def __init__(self, atom_dim, bond_dim, hidden_dim, num_heads, num_layers, protein_embed_dim, protein_hidden_dim, protein_layers, fc_hidden_dim):
        super().__init__()
        self.drug_repr = DrugRepresentation(atom_dim, bond_dim, hidden_dim, num_heads, num_layers)
        self.protein_repr = ProteinRepresentation(protein_embed_dim, protein_hidden_dim, protein_layers)
        self.interaction_pred = InteractionPrediction(hidden_dim + protein_hidden_dim, fc_hidden_dim)

    def forward(self, drug_data, protein_seq):
        """Predict an interaction probability for each drug-protein pair.

        Args:
            drug_data: Object exposing ``x``, ``edge_index`` and ``edge_attr``;
                an optional ``batch`` attribute assigns nodes to graphs.
            protein_seq: Residue indices, ``(length,)`` or ``(batch, length)``.

        Returns:
            Tensor of shape ``(num_pairs, 1)`` with values in (0, 1).

        Raises:
            ValueError: If the number of encoded molecules and proteins differ
                and the drug side is not a single molecule.
        """
        batch = getattr(drug_data, 'batch', None)
        drug_feat = self.drug_repr(drug_data.x, drug_data.edge_index, drug_data.edge_attr, batch)
        protein_feat = self.protein_repr(protein_seq)

        num_drugs, num_proteins = drug_feat.size(0), protein_feat.size(0)
        if num_drugs != num_proteins:
            if num_drugs != 1:
                raise ValueError(
                    f"got {num_drugs} molecules but {num_proteins} protein sequences; "
                    "pass protein_seq as (num_molecules, length) or feed one pair at a time"
                )
            # A single molecule is paired with every protein in the batch.
            drug_feat = drug_feat.expand(num_proteins, -1)

        return self.interaction_pred(drug_feat, protein_feat)

# Build the model from a single hyperparameter table so the settings can be
# inspected or logged in one place.
model_config = dict(
    atom_dim=1,
    bond_dim=1,
    hidden_dim=128,
    num_heads=1,
    num_layers=12,
    protein_embed_dim=32,
    protein_hidden_dim=128,
    protein_layers=3,
    fc_hidden_dim=64,
)
model = GraphormerDTI(**model_config)

# Training and Evaluation Functions
def train_model(model, data_loader, criterion, optimizer):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Callable as ``model(batch, batch.protein_seq)``; must expose
            ``train()``.
        data_loader: Iterable of batches carrying ``protein_seq`` and ``y``.
        criterion: Loss taking ``(output, label)`` tensors of equal shape.
        optimizer: Optimizer over the model parameters.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when the
        loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()

        # Flatten rather than squeeze(): squeeze() collapses a one-sample
        # batch to a 0-d tensor, which no longer matches the (1,) label.
        output = model(batch, batch.protein_seq).reshape(-1)
        label = batch.y.float().reshape(-1)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

def evaluate_model(model, data_loader):
    """Collect labels and rounded predictions over a data loader.

    Args:
        model: Callable as ``model(batch, batch.protein_seq)``; must expose
            ``eval()``.
        data_loader: Iterable of batches carrying ``protein_seq`` and ``y``.

    Returns:
        ``(all_labels, all_predictions)``: two flat lists with one entry per
        sample. Predictions are the model outputs rounded with
        ``Tensor.round()``.
    """
    model.eval()
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for batch in data_loader:
            # Flatten rather than squeeze(): for a one-sample batch squeeze()
            # yields a 0-d tensor whose NumPy array cannot be iterated by
            # list.extend().
            output = model(batch, batch.protein_seq).reshape(-1)
            label = batch.y.reshape(-1)

            all_predictions.extend(output.round().cpu().numpy())
            all_labels.extend(label.cpu().numpy())
    return all_labels, all_predictions

# Metrics Calculation
def calculate_metrics(y_true, y_pred, y_score=None):
    """Print binary-classification metrics and plot the confusion matrix.

    Args:
        y_true: Ground-truth binary labels (0 or 1).
        y_pred: Predicted binary labels (0 or 1).
        y_score: Optional continuous scores (e.g. predicted probabilities)
            used for the ranking metrics AUC and AUPR. When omitted,
            ``y_pred`` is used for them, as before.

    The function prints accuracy, F1, recall, precision, AUC, AUPR and MCC,
    then shows a 2x2 confusion-matrix heatmap. It returns ``None``.
    """
    ranking_scores = y_pred if y_score is None else y_score

    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the value scikit-learn would report anyway when a
    # class is never predicted, without emitting a warning per metric.
    f1 = f1_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    try:
        auc = roc_auc_score(y_true, ranking_scores)
    except ValueError:
        # ROC AUC is undefined when y_true contains a single class.
        auc = float('nan')
    aupr = average_precision_score(y_true, ranking_scores)
    mcc = matthews_corrcoef(y_true, y_pred)

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("AUC:", auc)
    print("AUPR:", aupr)
    print("MCC:", mcc)

    # Fixing the label order guarantees a 2x2 matrix that matches the two
    # tick labels even when one class is absent from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['No Interaction', 'Interaction'], yticklabels=['No Interaction', 'Interaction'])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

# Convert datasets to PyTorch Geometric DataLoaders
# Every sample carries a variable-length `protein_seq` tensor. When the loader
# collates several samples it concatenates such tensors along dim 0, which
# would merge all proteins of a mini-batch into one sequence, so the
# drug-protein pairs are fed one at a time.
BATCH_SIZE = 1
NUM_EPOCHS = 5

train_loader = DataLoader(train_data_list, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data_list, batch_size=BATCH_SIZE, shuffle=False)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model, train_loader, criterion, optimizer)
    print(f"Epoch {epoch+1}, Loss: {train_loss}")

# Evaluate the model
y_true, y_pred = evaluate_model(model, test_loader)
calculate_metrics(y_true, y_pred)
