<a href="https://colab.research.google.com/github/shreekar2005/Cheminformatics_DC/blob/main/Cheminformatics_DC_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
from scipy import io
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data using Pandas
y_tr = pd.read_csv('drive/MyDrive/DATASET/tox21_labels_train.csv.gz', index_col=0, compression="gzip")
y_te = pd.read_csv('drive/MyDrive/DATASET/tox21_labels_test.csv.gz', index_col=0, compression="gzip")
x_tr_dense = pd.read_csv('drive/MyDrive/DATASET/tox21_dense_train.csv.gz', index_col=0, compression="gzip").values
x_te_dense = pd.read_csv('drive/MyDrive/DATASET/tox21_dense_test.csv.gz', index_col=0, compression="gzip").values
x_tr_sparse = io.mmread('drive/MyDrive/DATASET/tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread('drive/MyDrive/DATASET/tox21_sparse_test.mtx.gz').tocsc()

# Filter out very sparse features and combine
sparse_col_idx = ((x_tr_sparse > 0).mean(axis=0) > 0.05).ravel()

# Ensure sparse_col_idx has the correct length and applies correctly
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])
# Convert to PyTorch tensors on CPU
device = torch.device("cpu")
x_tr_tensor = torch.tensor(x_tr, dtype=torch.float32)
x_te_tensor = torch.tensor(x_te, dtype=torch.float32)

# Define Transformer Model (CPU version)
class TabularTransformer(nn.Module):
    def __init__(self, input_dim, embed_dim=32, num_heads=2, num_layers=2, num_classes=1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        batch_size = x.shape[0]
        x = self.embedding(x).unsqueeze(1)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)
        x = self.transformer(x)
        cls_output = x[:, 0, :]
        return torch.sigmoid(self.classifier(cls_output).squeeze(-1))

# Store all confusion matrices and AUCs
conf_matrices = {}
aucs = {}

# Train and evaluate for each target
for target in y_tr.columns:
    # Filter rows with non-missing labels
    rows_tr = y_tr[target].notna().values
    rows_te = y_te[target].notna().values

    if sum(rows_tr) == 0 or sum(rows_te) == 0:
        print(f"Skipping {target} due to no valid samples")
        continue

    # Prepare data
    y_tr_tensor = torch.tensor(y_tr[target][rows_tr].values, dtype=torch.float32)
    train_dataset = TensorDataset(x_tr_tensor[rows_tr], y_tr_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Initialize model
    model = TabularTransformer(input_dim=x_tr.shape[1], num_classes=1)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    model.train()
    for epoch in range(10):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        test_outputs = model(x_te_tensor[rows_te]).squeeze().numpy()

    # Compute AUC
    auc_te = roc_auc_score(y_te[target][rows_te], test_outputs)
    aucs[target] = auc_te
    print(f"{target}: AUC = {auc_te:.5f}")

    # Confusion matrix
    p_te_pred = (test_outputs > 0.5).astype(int)
    conf_matrices[target] = confusion_matrix(y_te[target][rows_te], p_te_pred)

# Plot confusion matrices
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
axes = axes.ravel()
for i, (target, cm) in enumerate(conf_matrices.items()):
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Negative", "Positive"],
                yticklabels=["Negative", "Positive"],
                ax=axes[i])
    axes[i].set_title(f"{target} (AUC = {aucs[target]:.2f})")
    axes[i].set_xlabel("Predicted")
    axes[i].set_ylabel("Actual")
plt.tight_layout()
plt.show()


IndexError: boolean column index has incorrect length: 1 instead of 272776